/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input: rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
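    // The flag presumably stays zero until the VM enables the calculation
    // once RTMLockingCalculationDelay has elapsed; until then the branch
    // below skips the abort ratio calculation entirely.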
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
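    // rtm_abort_ratio_calculation() receives abort_status_Reg as its tmpReg
    // and clobbers it; the value is needed again only by the retry logic,
    // hence the conditional save/restore around the call.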
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
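    // The increment below is reached only about 1/RTMTotalCountIncrRate of
    // the time: branch_on_random_using_rdtsc() above jumps to L_noincrement
    // whenever (tsc & (rate-1)) != 0.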
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}

// Use RTM for inflated locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
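  // (On x86_64, storing a 64-bit immediate through movptr first materializes
  // the constant in the scratch register r10; the (int32_t) cast selects a
  // sign-extended imm32 store instead, which is safe here since
  // unused_mark() fits in 32 bits.)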
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) issue explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//   fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//   fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path. If the fast path fails then we pass
//   control to the slow path, typically in C. In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock. In the case of success, control
//   will drop through the node. ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL); // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
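  // (Worked example: with 4K pages the 64-bit mask is 7 - 4096 =
  // 0x...FFFFF007, keeping the low lock bits and every bit at or above the
  // page size; the AND result is zero -- ZF == 1 -- exactly when the
  // displaced header is a stack address at most one page above rsp and the
  // lock bits are clear.)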
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
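  // (The CAS leaves ZF == 1 exactly when _owner was null and r15_thread was
  // installed, which is precisely the success protocol DONE_LABEL expects,
  // so no further flag manipulation is needed.)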
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
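  // (notEqual means ZF == 0: the CAS failed because _owner is no longer
  // null, i.e. another thread re-acquired the lock and thereby inherits
  // responsibility for succession, so our unlock still counts as a success.)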
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind (LGoSlowPath);
  orl  (boxReg, 1);  // set ICC.ZF=0 to indicate failure
  jmpb (DONE_LABEL);

  bind (LSuccess);
  testl(boxReg, 0);  // set ICC.ZF=1 to indicate success
  jmpb (DONE_LABEL);

  bind (Stacked);
  movptr(tmpReg, Address(boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD), "opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF), "opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVI) {
    psrad(dst, src);
  } else if (opcode == Op_LShiftVI) {
    pslld(dst, src);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    psrld(dst, src);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVI) {
    vpsrad(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVI) {
    vpslld(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVI), "opcode should be Op_URShiftVI");
    vpsrld(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    psraw(dst, src);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    psllw(dst, src);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)), "opcode should be one of Op_URShiftVS or Op_URShiftVB");
    psrlw(dst, src);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if ((opcode == Op_RShiftVS) || (opcode == Op_RShiftVB)) {
    vpsraw(dst, nds, src, vector_len);
  } else if ((opcode == Op_LShiftVS) || (opcode == Op_LShiftVB)) {
    vpsllw(dst, nds, src, vector_len);
  } else {
    assert(((opcode == Op_URShiftVS) || (opcode == Op_URShiftVB)), "opcode should be one of Op_URShiftVS or Op_URShiftVB");
    vpsrlw(dst, nds, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src) {
  if (opcode == Op_RShiftVL) {
    psrlq(dst, src); // using srl to implement sra on pre-avx512 systems
  } else if (opcode == Op_LShiftVL) {
    psllq(dst, src);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    psrlq(dst, src);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) {
  if (opcode == Op_RShiftVL) {
    evpsraq(dst, nds, src, vector_len);
  } else if (opcode == Op_LShiftVL) {
    vpsllq(dst, nds, src, vector_len);
  } else {
    assert((opcode == Op_URShiftVL), "opcode should be Op_URShiftVL");
    vpsrlq(dst, nds, src, vector_len);
  }
}

// Reductions for vectors of ints, longs, floats, and doubles.
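//
// The helpers below share one folding strategy (an illustrative sketch, not
// a contract): repeatedly combine the upper half of the vector into the
// lower half with the reduction op, halving the live width each step, then
// fold in the scalar accumulator last. For example, a 16-way int reduction:
//   512-bit src     --vextracti64x4_high + 256-bit op--> 256-bit partial
//   256-bit partial --vextracti128_high  + 128-bit op--> 128-bit partial
//   128-bit partial --pshufd(0xE) + op--> 64-bit, --pshufd(0x1) + op--> 32-bit
//   32-bit partial  --op with src1 (the scalar input)--> dst
// (The AddReductionVI variants use phaddd/vphaddd for the horizontal adds.)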

void C2_MacroAssembler::reduce_operation_128(int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;

    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI: paddd(dst, src); break;
    case Op_AddReductionVL: paddq(dst, src); break;

    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI: pmulld(dst, src); break;
    case Op_MulReductionVL: vpmullq(dst, dst, src, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;

    case Op_AddReductionVI: vpaddd(dst, src1, src2, vector_len); break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;

    case Op_MulReductionVI: vpmulld(dst, src1, src2, vector_len); break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
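  // (For UL the substring is Latin-1 while the scanned string is UTF-16, so
  // pmovzxbw zero-extends the 8 substring bytes to 16-bit chars before the
  // pcmpestri compares; the other encodings can use a plain 16-byte load.)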
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is number of substring remaining elements and
    // cnt1 is number of string remaining elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));

  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only string if does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
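    // (cnt2 becomes a negative offset that counts up toward zero: it starts
    // at -(cnt2 - stride), and each SCAN_SUBSTR iteration adds stride, so a
    // 'negative' result means part of the substring is still uncompared.)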
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload string pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through stack if they cross page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;
// Small strings are loaded through the stack if they cross a page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non constant substring, in which case its length
  // is in the cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0");
  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12, "sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        }
        else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
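    // (A worked example of the page math above and below, assuming a
    // 4096-byte page: (addr & 4095) > 4096 - 16 means a 16-byte load at
    // addr would spill onto the next, possibly unmapped, page; only then
    // is the string copied to the stack.)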
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
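  // (E.g., assuming UU with cnt1 == 3 chars left: result is rewound by
  // 16 - 3*2 = 10 bytes so the final 16-byte vector ends exactly at the
  // end of the string, rescanning 5 already-checked chars instead of
  // reading past the array.)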
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
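    // (Explanatory note: with fewer than stride substring elements left,
    // both pointers are rewound so the last compare covers exactly the
    // final stride elements, and cnt1 is adjusted by the same amount,
    // cnt1 = cnt1 - cnt2 + stride, to stay consistent.)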
    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
    andl(cnt1, 0x0000000F); // tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);
    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
  andl(cnt1, 0x00000007); // tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);
  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addl(result, ch);

  bind(FOUND_SEQ_CHAR);
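  // result currently holds the address of the matching char; the code
  // below converts that back to a char index relative to str1 (the byte
  // offset divided by 2, since elements are UTF-16 code units).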
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte (elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Compute cnt2 = min(cnt1, cnt2) with a conditional move.
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)
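  // (Worked example: cnt1 == 5, cnt2 == 9 leaves cnt1 - cnt2 == -4 on the
  // stack and cnt2 == 5; if the first 5 elements compare equal, the -4 is
  // popped and returned, i.e. the shorter string orders first.)
  //
  // The Java-level contract being implemented is roughly the following
  // (a sketch of the String.compareTo semantics, not the exact JDK source):
  //
  //   static int compare(char[] a, char[] b) {
  //     int min = Math.min(a.length, b.length);
  //     for (int i = 0; i < min; i++) {
  //       if (a[i] != b[i]) return a[i] - b[i];
  //     }
  //     return a.length - b.length;
  //   }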
  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and set up scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Setup to compare 16-char (32-byte) vectors,
    // start from first character again because it has aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  //stride << scale1;
      adr_stride2 = 16; //stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path: compare first 2 8-char vectors.
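    // (Reference note: per the Intel SDM encoding of the pcmpestri imm8,
    // 0x19 decodes as bits 1:0 = 01 unsigned words (cleared to 00 = bytes
    // for LL above), bits 3:2 = 10 "equal each" element-wise compare, and
    // bits 5:4 = 01 negative polarity, so rcx yields the index of the
    // first mismatching element.)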
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0s
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11 if operands are equal, otherwise k7 has some 0s
      }
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16-byte half of the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
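    // (Note on the loop below: the running index in rax is negative and
    // climbs toward zero; as documented for pcmpestri, the absolute values
    // of EAX/EDX are used, saturated to the vector element count, so the
    // negative index simply means "the whole vector is valid" here.)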
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //       + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length. Return the length difference.
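  // (E.g., assuming UU with str1 of 5 chars and str2 of 7 chars that agree
  // on the first 5: the pushed byte difference is -4, which the sarl below
  // halves to the char difference -2, matching compareTo semantics.)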
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  if (ae == StrIntrinsicNode::UU) {
    // Divide diff by 2 to get number of chars
    sarl(result, 1);
  }
  jmpb(DONE_LABEL);

#ifdef _LP64
  if (VM_Version::supports_avx512vlbw()) {

    bind(COMPARE_WIDE_VECTORS_LOOP_FAILED);

    kmovql(cnt1, k7);
    notq(cnt1);
    bsfq(cnt2, cnt1);
    if (ae != StrIntrinsicNode::LL) {
      // Divide diff by 2 to get number of chars
      sarl(cnt2, 1);
    }
    addq(result, cnt2);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(cnt1, Address(str2, result));
      load_unsigned_byte(result, Address(str1, result));
    } else if (ae == StrIntrinsicNode::UU) {
      load_unsigned_short(cnt1, Address(str2, result, scale));
      load_unsigned_short(result, Address(str1, result, scale));
    } else {
      load_unsigned_short(cnt1, Address(str2, result, scale2));
      load_unsigned_byte(result, Address(str1, result, scale1));
    }
    subl(result, cnt1);
    jmpb(POP_LABEL);
  }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
  if (ae == StrIntrinsicNode::UL) {
    negl(result);
  }

}

// Search for a non-ASCII character (negative byte value) in a byte array,
// returning true if any is found and false otherwise.
// ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java
// @HotSpotIntrinsicCandidate
// private static boolean hasNegatives(byte[] ba, int off, int len) {
//   for (int i = off; i < off + len; i++) {
//     if (ba[i] < 0) {
//       return true;
//     }
//   }
//   return false;
// }
void C2_MacroAssembler::has_negatives(Register ary1, Register len,
                                      Register result, Register tmp1,
                                      XMMRegister vec1, XMMRegister vec2) {
  // rsi: byte array
  // rcx: len
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(ary1, len, result, tmp1);
  assert_different_registers(vec1, vec2);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE;

  // len == 0
  testl(len, len);
  jcc(Assembler::zero, FALSE_LABEL);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label test_64_loop, test_tail;
    Register tmp3_aliased = len;

    movl(tmp1, len);
    vpxor(vec2, vec2, vec2, Assembler::AVX_512bit);

    andl(tmp1, 64 - 1);   // tail count (in chars) 0x3F
    andl(len, ~(64 - 1)); // vector count (in chars)
    jccb(Assembler::zero, test_tail);

    lea(ary1, Address(ary1, len, Address::times_1));
    negptr(len);

    bind(test_64_loop);
    // Check whether our 64 elements of size byte contain negatives
    evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit);
    kortestql(k2, k2);
    jcc(Assembler::notZero, TRUE_LABEL);

    addptr(len, 64);
    jccb(Assembler::notZero, test_64_loop);

    bind(test_tail);
    // bail out when there is nothing to be done
    testl(tmp1, -1);
    jcc(Assembler::zero, FALSE_LABEL);

    // ~(~0 << len) applied up to two times (for 32-bit scenario)
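    // (E.g., assuming tmp1 == 5 tail bytes: ~0 << 5 = ...11100000, and its
    // complement 0b11111 is a k-mask selecting exactly the 5 tail lanes.)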
#ifdef _LP64
    mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp3_aliased, tmp3_aliased, tmp1);
    notq(tmp3_aliased);
    kmovql(k3, tmp3_aliased);
#else
    Label k_init;
    jmp(k_init);

    // We cannot load 64 bits into a general purpose register on 32-bit, so
    // we emit the data required to compose 64 1's into the instruction
    // stream: a 64-byte-wide series of the elements 0..63, which is later
    // used as a compare target against the tail count in the tmp1 register.
    // The result is a k register with tmp1 consecutive 1's, counting from
    // the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); //   tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); //   tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
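  // (Scalar tail below, a worked example: a 4-byte word 0x41624380 ANDed
  // with 0x80808080 leaves 0x00000080 != 0, i.e. its low byte 0x80 is a
  // negative byte, so the routine answers true.)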
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1);   // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);      // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
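    // (E.g., a char[] of length 5 becomes limit == 10 bytes for the
    // byte-oriented vector loops below.)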
    shll(limit, 1);      // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); //   tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL);  // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
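      // (Worked example: for 100-byte arrays, one 64-byte compare leaves
      // limit == -32 with tail count result == 4; the overlapping compare
      // below at result-64 covers bytes 36..99, re-checking 28 bytes that
      // are already known to be equal, which is harmless.)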
2466 // 2467 addptr(result, -64); // it is safe, bc we just came from this area 2468 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 2469 evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 2470 kortestql(k7, k7); 2471 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 2472 2473 jmp(TRUE_LABEL); 2474 2475 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 2476 2477 }//if (VM_Version::supports_avx512vlbw()) 2478 #endif //_LP64 2479 bind(COMPARE_WIDE_VECTORS); 2480 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 2481 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 2482 vpxor(vec1, vec2); 2483 2484 vptest(vec1, vec1); 2485 jcc(Assembler::notZero, FALSE_LABEL); 2486 addptr(limit, 32); 2487 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 2488 2489 testl(result, result); 2490 jcc(Assembler::zero, TRUE_LABEL); 2491 2492 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 2493 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 2494 vpxor(vec1, vec2); 2495 2496 vptest(vec1, vec1); 2497 jccb(Assembler::notZero, FALSE_LABEL); 2498 jmpb(TRUE_LABEL); 2499 2500 bind(COMPARE_TAIL); // limit is zero 2501 movl(limit, result); 2502 // Fallthru to tail compare 2503 } else if (UseSSE42Intrinsics) { 2504 // With SSE4.2, use double quad vector compare 2505 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 2506 2507 // Compare 16-byte vectors 2508 andl(result, 0x0000000f); // tail count (in bytes) 2509 andl(limit, 0xfffffff0); // vector count (in bytes) 2510 jcc(Assembler::zero, COMPARE_TAIL); 2511 2512 lea(ary1, Address(ary1, limit, Address::times_1)); 2513 lea(ary2, Address(ary2, limit, Address::times_1)); 2514 negptr(limit); 2515 2516 bind(COMPARE_WIDE_VECTORS); 2517 movdqu(vec1, Address(ary1, limit, Address::times_1)); 2518 movdqu(vec2, Address(ary2, limit, Address::times_1)); 2519 pxor(vec1, vec2); 2520 2521 ptest(vec1, vec1); 2522 jcc(Assembler::notZero, FALSE_LABEL); 2523 addptr(limit, 16); 2524 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 2525 2526 testl(result, result); 2527 jcc(Assembler::zero, TRUE_LABEL); 2528 2529 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 2530 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 2531 pxor(vec1, vec2); 2532 2533 ptest(vec1, vec1); 2534 jccb(Assembler::notZero, FALSE_LABEL); 2535 jmpb(TRUE_LABEL); 2536 2537 bind(COMPARE_TAIL); // limit is zero 2538 movl(limit, result); 2539 // Fallthru to tail compare 2540 } 2541 2542 // Compare 4-byte vectors 2543 andl(limit, 0xfffffffc); // vector count (in bytes) 2544 jccb(Assembler::zero, COMPARE_CHAR); 2545 2546 lea(ary1, Address(ary1, limit, Address::times_1)); 2547 lea(ary2, Address(ary2, limit, Address::times_1)); 2548 negptr(limit); 2549 2550 bind(COMPARE_VECTORS); 2551 movl(chr, Address(ary1, limit, Address::times_1)); 2552 cmpl(chr, Address(ary2, limit, Address::times_1)); 2553 jccb(Assembler::notEqual, FALSE_LABEL); 2554 addptr(limit, 4); 2555 jcc(Assembler::notZero, COMPARE_VECTORS); 2556 2557 // Compare trailing char (final 2 bytes), if any 2558 bind(COMPARE_CHAR); 2559 testl(result, 0x2); // tail char 2560 jccb(Assembler::zero, COMPARE_BYTE); 2561 load_unsigned_short(chr, Address(ary1, 0)); 2562 load_unsigned_short(limit, Address(ary2, 0)); 2563 cmpl(chr, limit); 2564 jccb(Assembler::notEqual, FALSE_LABEL); 2565 2566 if (is_array_equ && is_char) { 2567 bind(COMPARE_BYTE); 2568 } else { 2569 lea(ary1, Address(ary1, 2)); 2570 lea(ary2, Address(ary2, 2)); 2571 2572 bind(COMPARE_BYTE); 2573 testl(result, 0x1); // tail 
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1);      // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
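// For reference, the Java-level semantics the arrays_equals stub above
// accelerates are roughly the following (a sketch of the Arrays.equals
// contract, not the exact JDK source):
//
//   static boolean equals(byte[] a, byte[] b) {
//     if (a == b) return true;
//     if (a == null || b == null || a.length != b.length) return false;
//     for (int i = 0; i < a.length; i++) {
//       if (a[i] != b[i]) return false;  // first mismatch wins
//     }
//     return true;
//   }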