/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}
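
// Illustrative sketch (comments only, not compiled): the decision above in
// plain C++. The flag names are the real product flags consumed by the
// generated code; set_rtm_state() is a hypothetical stand-in for the MDO
// update performed with mov_metadata + lock or.
//
//   uintx aborted = abort_count * 100;                    // scaled abort count
//   uintx all     = total_count * RTMTotalCountIncrRate;  // estimated total transactions
//   if (aborted >= all * RTMAbortRatio) {
//     md->set_rtm_state(NoRTM);        // abort ratio too high - stop using RTM
//   } else if (all >= RTMLockingThreshold) {              // up to integer rounding
//     md->set_rtm_state(UseRTM);       // enough traffic with a healthy ratio
//   }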

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}
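
// In C terms (illustrative only), the retry policy above is:
//
//   if ((abort_status & 0x6) != 0 && retry_count != 0) {
//     pause(); retry_count--; goto retry;   // transient abort - try again
//   }
//   // otherwise fall through to the non-transactional path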

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);    // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
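
// The transactional stack-lock path above, as illustrative pseudo-C
// (comments only; xbegin/xend stand in for the RTM instructions):
//
//   retry_count = RTMRetryCount;
// retry:
//   if (obj->mark() & monitor_value) goto inflated;
//   if (xbegin() == STARTED) {
//     if ((obj->mark() & lock_bits) == unlocked_value) goto done; // lock elided;
//       // the markword is now in the read set, so a competing locker aborts us
//     xabort(); // or xend() plus a synthetic "retryable" status if UseRTMXendForLockBusy
//   }
//   // abort handler: profile, then rtm_retry_lock_on_abort() may jump back to retry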

// Use RTM for inflated locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED.
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.
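
// Illustrative pseudo-C (comments only) of the markword triage that
// fast_lock below implements; helper names are descriptive, not real
// functions:
//
//   mark = obj->mark();
//   if (mark & monitor_value) {                      // inflated
//     success = CAS(&monitor->_owner, NULL, Self) == NULL;
//   } else {                                         // neutral or stack-locked
//     box->dhw = mark | unlocked_value;              // anticipate success
//     success = CAS(&obj->mark, mark | unlocked_value, box)  // stack-lock
//               || ((mark - rsp) & proximity_mask) == 0;     // recursive case
//   }
//   // ZF is set iff success; ZF==0 routes control to the slow path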

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);        // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);
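
  // Why the andptr mask above works (illustrative, 64-bit case): with a 4K
  // page, (7 - os::vm_page_size()) is 0x...fffff007, so the AND clears only
  // the in-page offset bits 3..11, and ZF==1 therefore means
  //
  //   0 <= mark - rsp < os::vm_page_size()  &&  (mark - rsp) % 8 == 0
  //
  // i.e. the markword points to an 8-byte-aligned BasicLock within a page of
  // this thread's SP: a recursive stack-lock. In that case box->dhw is set
  // to 0, which is what fast_unlock's recursive test looks for.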

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
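  // The LP64 inflated fast path above, in illustrative pseudo-C (comments
  // only; the CAS outcome arrives in ZF, and the mov does not touch flags):
  //
  //   if (CAS(&m->_owner, NULL, r15_thread) == NULL) ZF = 1;  // lock acquired
  //   else                                           ZF = 0;  // contended -> slow path
  //   box->dhw = unused_mark();  // unconditional; any non-zero value works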
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't say what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
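
  // The 1-0 exit protocol below, as illustrative pseudo-C (comments only;
  // the 64-bit path follows this shape directly):
  //
  //   if (m->_recursions != 0) goto slow_path;     // recursive inflated unlock
  //   if ((m->_cxq | m->_EntryList) == NULL) {     // no waiters at all
  //     m->_owner = NULL; return success;          // plain 1-0 exit
  //   }
  //   m->_owner = NULL;                            // drop the lock ...
  //   fence();                                     // Dekker pivot: ST _owner vs LD _succ
  //   if (m->_succ != NULL) return success;        // a successor exists - it will run
  //   if (CAS(&m->_owner, NULL, Self) != NULL) return success; // new owner handles succession
  //   goto slow_path;                              // re-acquired: must pick a successor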
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instruction support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, src);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, src);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    psrld(dst, src);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVI),"opcode should be Op_URShiftVI");
    vpsrld(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    psraw(dst, src);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    psllw(dst, src);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
    psrlw(dst, src);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    vpsraw(dst, nds, src, vector_len);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    vpsllw(dst, nds, src, vector_len);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)),"opcode should be one of Op_URShiftVS or Op_URShiftVB");
    vpsrlw(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, src);  // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, src);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    psrlq(dst, src);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVL),"opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, src, vector_len);
  }
}

// Reductions for vectors of ints, longs, floats, and doubles.
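
// Illustrative scalar equivalent (comments only) of the reduction lowering
// below: reduceNI(op, dst, src1, src2) computes
//
//   int acc = src1;               // incoming scalar accumulator
//   for (int i = 0; i < N; i++) {
//     acc = op(acc, src2[i]);     // fold in each vector lane
//   }
//   dst = acc;
//
// but in log2(N) steps: the upper half of the vector is repeatedly folded
// into the lower half (vextract*/pshufd plus the vector op), and the final
// lane is combined with src1 through the 128-bit helper.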

void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;

    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI: paddd(dst, src); break;
    case Op_AddReductionVL: paddq(dst, src); break;

    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI: pmulld(dst, src); break;
    case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;

    case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;

    case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through the stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");
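
  // How pcmpestri drives this loop (illustrative summary of the flag
  // semantics of "equal ordered" mode, per the Intel SDM):
  //   - rcx gets the index of the first element at which the substring head
  //     begins to match, or the element count if there is no match;
  //   - CF is set if any (possibly partial) match was found anywhere;
  //   - OF is set if the match starts at element 0, i.e. as much of the
  //     substring as fits in the 16-byte window matched at the window start.
  // So jcc(below, ...) keys on CF (candidate found) and jcc(overflow, ...)
  // keys on OF (match at the start of the window).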

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through the stack if they cross a page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is the length of a small (< 8 chars) constant substring,
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

// Small strings are loaded through the stack if they cross a page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2,  Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) { // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12, "sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp); // New substring address
    } // non constant
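
    // The page check above relies on page-granular mapping: a 16-byte load
    // that starts at an in-page offset <= page_size-16 cannot touch the next
    // page. For example, with 4K pages an address with (addr & 0xfff) == 0xff1
    // fails the check (0xff1 > 0xff0) because a 16-byte load from it would
    // cross into the following page.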
    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp); // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp); // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) { // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant
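
  // Main scan loop invariants: 'result' points at the current 16-byte search
  // window in the string, cnt1 holds the number of elements left from that
  // point, and vec holds (up to) the first 'stride' elements of the substring.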

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
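    // The back-up below shifts both pointers so the final, partial vector is
    // compared as a full 16-byte vector ending exactly at the substring end.
    // Elements that were already compared are re-compared, which is harmless.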
    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof
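
// string_indexof_char below implements, roughly, the following Java-level
// loop (a reference sketch, not the exact library code):
//
//   static int indexOfChar(char[] str, int len, char ch) {
//     for (int i = 0; i < len; i++) {
//       if (str[i] == ch) {
//         return i;
//       }
//     }
//     return -1;
//   }
//
// The generated code searches 16 chars (32 bytes) at a time with AVX2 when
// available, then 8 chars (16 bytes) with SSE, then one char at a time.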
void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
    andl(cnt1, 0x0000000F); // tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
  andl(cnt1, 0x00000007); // tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addl(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
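// string_compare implements, roughly, the following Java-level semantics (a
// reference sketch; names are illustrative): the result is the difference of
// the first mismatching elements, or the difference of the lengths when one
// string is a prefix of the other.
//
//   static int compare(char[] s1, int len1, char[] s2, int len2) {
//     int min = Math.min(len1, len2);
//     for (int i = 0; i < min; i++) {
//       if (s1[i] != s2[i]) {
//         return s1[i] - s2[i];
//       }
//     }
//     return len1 - len2;
//   }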
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Use a conditional move to compute the minimum.
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0)); // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));   // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and setup scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
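    // pcmpmask 0x19 selects "equal each" aggregation (element-wise string
    // compare) with negated result and unsigned-word elements; clearing the
    // low bit (0x18, done above for LL) selects unsigned bytes instead. With
    // negated polarity, CF == 1 ("below") means some element pair differed
    // and rcx holds the index of the first mismatch.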
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64
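
    // Equality idiom used below: vpxor produces all zeroes iff the two
    // vectors are equal, and vptest(vec1, vec1) sets ZF iff vec1 is all
    // zeroes, so "notZero" branches exactly on a mismatch. Similarly, in the
    // AVX-512 loop above, kortestql sets CF iff the mask is all ones, so
    // "aboveEqual" (CF == 0) branches on a miscompare.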
    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
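  // (The loop above indexes from the string ends using a negative count that
  //  rises to zero, so no separate bounds register is needed.)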
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, k7);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for Non-ASCII character (Negative byte value) in a byte array,
// return true if it has any and false otherwise.
//   ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
//   @HotSpotIntrinsicCandidate
//   private static boolean hasNegatives(byte[] ba, int off, int len) {
//     for (int i = off; i < off + len; i++) {
//       if (ba[i] < 0) {
//         return true;
//       }
//     }
//     return false;
//   }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(k2, k2);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
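    // ~(~0 << n) yields a mask with the low n bits set, e.g. n == 3 gives
    // ~(~0 << 3) == ~0b...11111000 == 0b111; it is used below to build a
    // k-register mask covering only the tail elements.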
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(k3, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot materialize a 64-bit mask in a general purpose register on
    // 32-bit, so we move the data required to compose it into the instruction
    // stream: a 64-byte series of the values 0..63, used below as a compare
    // target against the tail count broadcast from the tmp1 register. The
    // result is a k register with the low tmp1 bits set, counting from the
    // least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
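
  // The scalar tail below relies on the sign bit: a byte is negative iff bit
  // 7 is set, so ANDing four bytes with 0x80808080 is non-zero exactly when
  // one of them is negative (e.g. 0x7F46FF32 & 0x80808080 == 0x00008000).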
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
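
// A Java-level reference sketch of the equality check implemented by
// arrays_equals below (illustrative only, shown for the full-array char[]
// case; the same code also serves byte[] and substring comparisons):
//
//   static boolean equals(char[] a, char[] b) {
//     if (a == b)                 return true;
//     if (a == null || b == null) return false;
//     if (a.length != b.length)   return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i])         return false;
//     }
//     return true;
//   }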
// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
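      // For example, for an 87-byte comparison: result (tail) == 23, and the
      // loop above exits with limit == 0 having compared bytes 0..63, so the
      // single 64-byte compare below at ary+result-64 covers bytes 23..86,
      // re-checking 23..63 (already known equal) and finishing 64..86.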
      addptr(result, -64);   // it is safe, bc we just came from this area
      evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare

      jmp(TRUE_LABEL);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    }//if (VM_Version::supports_avx512vlbw())
#endif //_LP64
    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000f); // tail count (in bytes)
    andl(limit, 0xfffffff0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jcc(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jcc(Assembler::zero, TRUE_LABEL);

    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);      // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}