/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}
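
// Example: with count == 64 the branch is taken unless the low six bits of
// the TSC happen to be zero, so a caller that branches past its counter
// update ends up incrementing that counter on roughly 1 in 64 executions.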

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}
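
// Worked example (assuming default-style flag values): with
// RTMTotalCountIncrRate == 64 and RTMAbortRatio == 50, the no_rtm bit is set
// once abort_count * 100 >= (total_count * 64) * 50, i.e. once the estimated
// abort rate crosses 50% of all transactions.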

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy.
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
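
// Note: when rtm_retry_lock_on_busy exhausts its retry budget, control falls
// through to doneRetry and the trailing increment clears ZF; ZF == 0 is the
// failure indication callers use to divert into the slow path.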

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
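
// Note on the RTM protocol used above and in rtm_inflated_locking below:
// xbegin starts a transaction and falls through on success; on an abort the
// CPU rolls back to the xbegin fallback label with the abort status in RAX,
// which is why abort_status_Reg is pinned to rax in both routines.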

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif //  INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll().
//
// *  Use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg); // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL); // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
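  // The mask below keeps the low lock bits plus every bit at or above the
  // page bit, so the AND leaves zero (ZF == 1) only when the markword is a
  // properly aligned BasicLock pointer within one page of our own rsp,
  // i.e. a recursive stack-lock by the current thread.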
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                    // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1: At return-time the interpreter automatically and quietly unlocks any
//     objects acquired by the current activation (frame). Recall that the
//     interpreter maintains an on-stack list of locks currently held by
//     a frame.
// I2: If a method attempts to unlock an object that is not held by the
//     frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably, given that the spec legislates the JNI case as undefined, our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner == Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical-section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}
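
// Note: SSE has no packed min/max for 64-bit lanes, so the T_LONG cases above
// select lanes with pcmpgtq + blendvpd; the non-AVX blendvpd encoding uses
// xmm0 as an implicit mask register, which is why tmp is pinned to xmm0.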

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}
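
// The blend/min/cmp sequences above implement Java Math.min/max semantics
// rather than raw vminps/vmaxps: the initial sign-bit-driven blends order the
// operands so that -0.0 compares below +0.0, and the final unordered compare
// plus blend propagates a NaN from either input into the result.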

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_VRShiftV:  vpsravd(dst, src, shift, vlen_enc); break;
    case Op_VLShiftV:  vpsllvd(dst, src, shift, vlen_enc); break;
    case Op_VURShiftV: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_VRShiftV:  evpsravw(dst, src, shift, vlen_enc); break;
    case Op_VLShiftV:  evpsllvw(dst, src, shift, vlen_enc); break;
    case Op_VURShiftV: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_VRShiftV: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_VLShiftV: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_VURShiftV: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
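
// The AVX2 fallback in the Op_VRShiftV case above synthesizes an arithmetic
// right shift from logical shifts: with m = sign_mask >>> s per lane, the
// identity x >> s == ((x >>> s) ^ m) - m sign-extends each 64-bit lane.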

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  bool sign = (opcode == Op_VURShiftV) ? false : true;
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  bool sign = (opcode == Op_VURShiftV) ? false : true;
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}
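
// Both gather flavors use VSIB addressing: idx supplies one 32-bit index per
// lane, scaled by the element size against base. For vgather the XMM mask's
// per-lane sign bits gate the loads (and are consumed as elements complete);
// for evgather the k-register mask plays the same role.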

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */            break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}

void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 16) {
    movdqu(dst, addr, scratch);
  } else if (vlen_in_bytes == 32) {
    vmovdqu(dst, addr, scratch);
  } else {
    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
  }
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.
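//
// Each reduce_operation_128/256 call performs a single lane-wise combining
// step; the reduce{2,4,8,16,...} helpers further down fold a vector in halves
// (extract the high half, combine it with the low half, recurse) until a
// single scalar lane remains.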

void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            vpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT:       vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:         vpaddd(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:         vpmulld(dst, src1, src2, vector_len); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
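
// Note: the 256-bit flavor takes distinct src1/src2 operands so a caller can
// combine an extracted high half directly with the original low half (see
// reduce16I below) without first copying into dst.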
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1455 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1456 default: assert(false, "wrong type"); 1457 } 1458 break; 1459 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1460 default: assert(false, "wrong opcode"); 1461 } 1462 } 1463 1464 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1465 XMMRegister dst, XMMRegister src, 1466 XMMRegister vtmp1, XMMRegister vtmp2) { 1467 switch (opcode) { 1468 case Op_AddReductionVF: 1469 case Op_MulReductionVF: 1470 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1471 break; 1472 1473 case Op_AddReductionVD: 1474 case Op_MulReductionVD: 1475 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1476 break; 1477 1478 default: assert(false, "wrong opcode"); 1479 } 1480 } 1481 1482 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1483 Register dst, Register src1, XMMRegister src2, 1484 XMMRegister vtmp1, XMMRegister vtmp2) { 1485 switch (vlen) { 1486 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1487 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1488 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1489 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1490 1491 default: assert(false, "wrong vector length"); 1492 } 1493 } 1494 1495 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1496 Register dst, Register src1, XMMRegister src2, 1497 XMMRegister vtmp1, XMMRegister vtmp2) { 1498 switch (vlen) { 1499 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1500 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1501 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1502 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1503 1504 default: assert(false, "wrong vector length"); 1505 } 1506 } 1507 1508 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1509 Register dst, Register src1, XMMRegister src2, 1510 XMMRegister vtmp1, XMMRegister vtmp2) { 1511 switch (vlen) { 1512 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1513 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1514 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1515 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1516 1517 default: assert(false, "wrong vector length"); 1518 } 1519 } 1520 1521 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1522 Register dst, Register src1, XMMRegister src2, 1523 XMMRegister vtmp1, XMMRegister vtmp2) { 1524 switch (vlen) { 1525 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1526 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1527 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1528 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1529 1530 default: assert(false, "wrong vector length"); 1531 } 1532 } 1533 1534 #ifdef _LP64 1535 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1536 Register dst, Register src1, XMMRegister src2, 1537 XMMRegister vtmp1, XMMRegister vtmp2) { 1538 switch (vlen) { 1539 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1540 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1541 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1542 1543 default: assert(false, "wrong vector length"); 1544 } 1545 } 1546 #endif // _LP64 1547 1548 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2) { 1549 switch (vlen) { 1550 case 2: 1551 assert(vtmp2 == xnoreg, ""); 1552 reduce2F(opcode, dst, src, vtmp1); 1553 break; 1554 case 4: 1555 assert(vtmp2 == xnoreg, ""); 1556 reduce4F(opcode, dst, src, vtmp1); 1557 break; 1558 case 8: 1559 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1560 break; 1561 case 16: 1562 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1563 break; 1564 default: assert(false, "wrong vector length"); 1565 } 1566 } 1567 1568 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1569 switch (vlen) { 1570 case 2: 1571 assert(vtmp2 == xnoreg, ""); 1572 reduce2D(opcode, dst, src, vtmp1); 1573 break; 1574 case 4: 1575 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1576 break; 1577 case 8: 1578 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1579 break; 1580 default: assert(false, "wrong vector length"); 1581 } 1582 } 1583 1584 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1585 if (opcode == Op_AddReductionVI) { 1586 if (vtmp1 != src2) { 1587 movdqu(vtmp1, src2); 1588 } 1589 phaddd(vtmp1, vtmp1); 1590 } else { 1591 pshufd(vtmp1, src2, 0x1); 1592 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1593 } 1594 movdl(vtmp2, src1); 1595 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1596 movdl(dst, vtmp1); 1597 } 1598 1599 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1600 if (opcode == Op_AddReductionVI) { 1601 if (vtmp1 != src2) { 1602 movdqu(vtmp1, src2); 1603 } 1604 phaddd(vtmp1, src2); 1605 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1606 } else { 1607 pshufd(vtmp2, src2, 0xE); 1608 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1609 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1610 } 1611 } 1612 1613 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1614 if (opcode == Op_AddReductionVI) { 1615 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1616 vextracti128_high(vtmp2, vtmp1); 1617 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1618 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1619 } else { 1620 vextracti128_high(vtmp1, src2); 1621 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1622 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1623 } 1624 } 1625 1626 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1627 vextracti64x4_high(vtmp2, src2); 1628 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1629 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1630 } 1631 1632 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1633 pshufd(vtmp2, src2, 0x1); 1634 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1635 movdqu(vtmp1, vtmp2); 1636 psrldq(vtmp1, 2); 1637 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1638 movdqu(vtmp2, vtmp1); 1639 psrldq(vtmp2, 1); 1640 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1641 movdl(vtmp2, src1); 1642 pmovsxbd(vtmp1, vtmp1); 1643 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1644 pextrb(dst, vtmp1, 0x0); 1645 movsbl(dst, dst); 1646 } 1647 1648 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1649 
pshufd(vtmp1, src2, 0xE); 1650 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1651 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1652 } 1653 1654 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1655 vextracti128_high(vtmp2, src2); 1656 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1657 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1658 } 1659 1660 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1661 vextracti64x4_high(vtmp1, src2); 1662 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1663 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1664 } 1665 1666 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1667 pmovsxbw(vtmp2, src2); 1668 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1669 } 1670 1671 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1672 if (UseAVX > 1) { 1673 int vector_len = Assembler::AVX_256bit; 1674 vpmovsxbw(vtmp1, src2, vector_len); 1675 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1676 } else { 1677 pmovsxbw(vtmp2, src2); 1678 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1679 pshufd(vtmp2, src2, 0xE); // bring the upper 8 bytes into the low half 1680 pmovsxbw(vtmp2, vtmp2); // widen them and reduce the second half as well 1681 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1682 } 1683 } 1684 1685 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1686 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1687 int vector_len = Assembler::AVX_512bit; 1688 vpmovsxbw(vtmp1, src2, vector_len); 1689 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1690 } else { 1691 assert(UseAVX >= 2,"Should not reach here."); 1692 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1693 vextracti128_high(vtmp2, src2); 1694 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1695 } 1696 } 1697 1698 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1699 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1700 vextracti64x4_high(vtmp2, src2); 1701 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1702 } 1703 1704 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1705 if (opcode == Op_AddReductionVI) { 1706 if (vtmp1 != src2) { 1707 movdqu(vtmp1, src2); 1708 } 1709 phaddw(vtmp1, vtmp1); 1710 phaddw(vtmp1, vtmp1); 1711 } else { 1712 pshufd(vtmp2, src2, 0x1); 1713 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1714 movdqu(vtmp1, vtmp2); 1715 psrldq(vtmp1, 2); 1716 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1717 } 1718 movdl(vtmp2, src1); 1719 pmovsxwd(vtmp1, vtmp1); 1720 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1721 pextrw(dst, vtmp1, 0x0); 1722 movswl(dst, dst); 1723 } 1724 1725 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1726 if (opcode == Op_AddReductionVI) { 1727 if (vtmp1 != src2) { 1728 movdqu(vtmp1, src2); 1729 } 1730 phaddw(vtmp1, src2); 1731 } else { 1732 pshufd(vtmp1, src2, 0xE); 1733 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1734 } 1735 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1736 } 1737 1738 void
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1739 if (opcode == Op_AddReductionVI) { 1740 int vector_len = Assembler::AVX_256bit; 1741 vphaddw(vtmp2, src2, src2, vector_len); 1742 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1743 } else { 1744 vextracti128_high(vtmp2, src2); 1745 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1746 } 1747 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1748 } 1749 1750 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1751 int vector_len = Assembler::AVX_256bit; 1752 vextracti64x4_high(vtmp1, src2); 1753 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1754 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1755 } 1756 1757 #ifdef _LP64 1758 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1759 pshufd(vtmp2, src2, 0xE); 1760 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1761 movdq(vtmp1, src1); 1762 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1763 movdq(dst, vtmp1); 1764 } 1765 1766 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1767 vextracti128_high(vtmp1, src2); 1768 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1769 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1770 } 1771 1772 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1773 vextracti64x4_high(vtmp2, src2); 1774 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1775 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1776 } 1777 #endif // _LP64 1778 1779 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1780 reduce_operation_128(T_FLOAT, opcode, dst, src); 1781 pshufd(vtmp, src, 0x1); 1782 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1783 } 1784 1785 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1786 reduce2F(opcode, dst, src, vtmp); 1787 pshufd(vtmp, src, 0x2); 1788 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1789 pshufd(vtmp, src, 0x3); 1790 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1791 } 1792 1793 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1794 reduce4F(opcode, dst, src, vtmp2); 1795 vextractf128_high(vtmp2, src); 1796 reduce4F(opcode, dst, vtmp2, vtmp1); 1797 } 1798 1799 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1800 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1801 vextracti64x4_high(vtmp1, src); 1802 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 1803 } 1804 1805 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1806 reduce_operation_128(T_DOUBLE, opcode, dst, src); 1807 pshufd(vtmp, src, 0xE); 1808 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 1809 } 1810 1811 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1812 reduce2D(opcode, dst, src, vtmp2); 1813 vextractf128_high(vtmp2, src); 1814 reduce2D(opcode, dst, vtmp2, vtmp1); 1815 } 1816 1817 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) 
{ 1818 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1819 vextracti64x4_high(vtmp1, src); 1820 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 1821 } 1822 1823 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 1824 XMMRegister dst, XMMRegister src, 1825 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1826 XMMRegister xmm_0, XMMRegister xmm_1) { 1827 int permconst[] = {1, 14}; 1828 XMMRegister wsrc = src; 1829 XMMRegister wdst = xmm_0; 1830 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 1831 1832 int vlen_enc = Assembler::AVX_128bit; 1833 if (vlen == 16) { 1834 vlen_enc = Assembler::AVX_256bit; 1835 } 1836 1837 for (int i = log2(vlen) - 1; i >=0; i--) { 1838 if (i == 0 && !is_dst_valid) { 1839 wdst = dst; 1840 } 1841 if (i == 3) { 1842 vextracti64x4_high(wtmp, wsrc); 1843 } else if (i == 2) { 1844 vextracti128_high(wtmp, wsrc); 1845 } else { // i = [0,1] 1846 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 1847 } 1848 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 1849 wsrc = wdst; 1850 vlen_enc = Assembler::AVX_128bit; 1851 } 1852 if (is_dst_valid) { 1853 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 1854 } 1855 } 1856 1857 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 1858 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1859 XMMRegister xmm_0, XMMRegister xmm_1) { 1860 XMMRegister wsrc = src; 1861 XMMRegister wdst = xmm_0; 1862 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 1863 int vlen_enc = Assembler::AVX_128bit; 1864 if (vlen == 8) { 1865 vlen_enc = Assembler::AVX_256bit; 1866 } 1867 for (int i = log2(vlen) - 1; i >=0; i--) { 1868 if (i == 0 && !is_dst_valid) { 1869 wdst = dst; 1870 } 1871 if (i == 1) { 1872 vextracti128_high(wtmp, wsrc); 1873 } else if (i == 2) { 1874 vextracti64x4_high(wtmp, wsrc); 1875 } else { 1876 assert(i == 0, "%d", i); 1877 vpermilpd(wtmp, wsrc, 1, vlen_enc); 1878 } 1879 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 1880 wsrc = wdst; 1881 vlen_enc = Assembler::AVX_128bit; 1882 } 1883 if (is_dst_valid) { 1884 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 1885 } 1886 } 1887 1888 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 1889 switch (bt) { 1890 case T_BYTE: pextrb(dst, src, idx); break; 1891 case T_SHORT: pextrw(dst, src, idx); break; 1892 case T_INT: pextrd(dst, src, idx); break; 1893 case T_LONG: pextrq(dst, src, idx); break; 1894 1895 default: 1896 assert(false,"Should not reach here."); 1897 break; 1898 } 1899 } 1900 1901 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 1902 int esize = type2aelembytes(typ); 1903 int elem_per_lane = 16/esize; 1904 int lane = elemindex / elem_per_lane; 1905 int eindex = elemindex % elem_per_lane; 1906 1907 if (lane >= 2) { 1908 assert(UseAVX > 2, "required"); 1909 vextractf32x4(dst, src, lane & 3); 1910 return dst; 1911 } else if (lane > 0) { 1912 assert(UseAVX > 0, "required"); 1913 vextractf128(dst, src, lane); 1914 return dst; 1915 } else { 1916 return src; 1917 } 1918 } 1919 1920 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 1921 int esize = type2aelembytes(typ); 1922 int elem_per_lane = 16/esize; 1923 int eindex = elemindex % elem_per_lane; 1924 assert(is_integral_type(typ),"required"); 1925 1926 if (eindex == 0) { 1927 if 
(typ == T_LONG) { 1928 movq(dst, src); 1929 } else { 1930 movdl(dst, src); 1931 if (typ == T_BYTE) 1932 movsbl(dst, dst); 1933 else if (typ == T_SHORT) 1934 movswl(dst, dst); 1935 } 1936 } else { 1937 extract(typ, dst, src, eindex); 1938 } 1939 } 1940 1941 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 1942 int esize = type2aelembytes(typ); 1943 int elem_per_lane = 16/esize; 1944 int eindex = elemindex % elem_per_lane; 1945 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 1946 1947 if (eindex == 0) { 1948 movq(dst, src); 1949 } else { 1950 if (typ == T_FLOAT) { 1951 if (UseAVX == 0) { 1952 movdqu(dst, src); 1953 pshufps(dst, dst, eindex); 1954 } else { 1955 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 1956 } 1957 } else { 1958 if (UseAVX == 0) { 1959 movdqu(dst, src); 1960 psrldq(dst, eindex*esize); 1961 } else { 1962 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 1963 } 1964 movq(dst, dst); 1965 } 1966 } 1967 // Zero upper bits 1968 if (typ == T_FLOAT) { 1969 if (UseAVX == 0) { 1970 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 1971 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 1972 pand(dst, vtmp); 1973 } else { 1974 assert((tmp != noreg), "required."); 1975 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); 1976 } 1977 } 1978 } 1979 1980 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 1981 switch(typ) { 1982 case T_BYTE: 1983 evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 1984 break; 1985 case T_SHORT: 1986 evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 1987 break; 1988 case T_INT: 1989 case T_FLOAT: 1990 evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 1991 break; 1992 case T_LONG: 1993 case T_DOUBLE: 1994 evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 1995 break; 1996 default: 1997 assert(false,"Should not reach here."); 1998 break; 1999 } 2000 } 2001 2002 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2003 switch(typ) { 2004 case T_BYTE: 2005 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2006 break; 2007 case T_SHORT: 2008 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2009 break; 2010 case T_INT: 2011 case T_FLOAT: 2012 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2013 break; 2014 case T_LONG: 2015 case T_DOUBLE: 2016 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2017 break; 2018 default: 2019 assert(false,"Should not reach here."); 2020 break; 2021 } 2022 } 2023 2024 //------------------------------------------------------------------------------------------- 2025 2026 // IndexOf for constant substrings with size >= 8 chars 2027 // which don't need to be loaded through stack. 
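//
// At the Java level the generated code behaves roughly like the
// illustrative sketch below (names and types are ours, shown for the
// UTF-16 case; the real loop tests all alignments of the substring
// head in a single pcmpestri per 16-byte window):
//
//   int indexOfC8(const jchar* str, int strLen, const jchar* sub, int subLen) {
//     for (int i = 0; i + subLen <= strLen; i++) {
//       int j = 0;
//       while (j < subLen && str[i + j] == sub[j]) j++;  // compare the rest
//       if (j == subLen) return i;                       // full match at i
//     }
//     return -1;                                         // not found
//   }
//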
2028 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2029 Register cnt1, Register cnt2, 2030 int int_cnt2, Register result, 2031 XMMRegister vec, Register tmp, 2032 int ae) { 2033 ShortBranchVerifier sbv(this); 2034 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2035 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2036 2037 // This method uses the pcmpestri instruction with bound registers 2038 // inputs: 2039 // xmm - substring 2040 // rax - substring length (elements count) 2041 // mem - scanned string 2042 // rdx - string length (elements count) 2043 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2044 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2045 // outputs: 2046 // rcx - matched index in string 2047 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2048 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2049 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2050 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2051 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2052 2053 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2054 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2055 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2056 2057 // Note, inline_string_indexOf() generates checks: 2058 // if (substr.count > string.count) return -1; 2059 // if (substr.count == 0) return 0; 2060 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2061 2062 // Load substring. 2063 if (ae == StrIntrinsicNode::UL) { 2064 pmovzxbw(vec, Address(str2, 0)); 2065 } else { 2066 movdqu(vec, Address(str2, 0)); 2067 } 2068 movl(cnt2, int_cnt2); 2069 movptr(result, str1); // string addr 2070 2071 if (int_cnt2 > stride) { 2072 jmpb(SCAN_TO_SUBSTR); 2073 2074 // Reload substr for rescan; this code 2075 // is executed only for large substrings (> 8 chars) 2076 bind(RELOAD_SUBSTR); 2077 if (ae == StrIntrinsicNode::UL) { 2078 pmovzxbw(vec, Address(str2, 0)); 2079 } else { 2080 movdqu(vec, Address(str2, 0)); 2081 } 2082 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2083 2084 bind(RELOAD_STR); 2085 // We came here after the beginning of the substring was 2086 // matched but the rest of it was not, so we need to search 2087 // again. Start from the next element after the previous match. 2088 2089 // cnt2 is the number of remaining substring elements and 2090 // cnt1 is the number of remaining string elements when the compare failed. 2091 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2092 subl(cnt1, cnt2); 2093 addl(cnt1, int_cnt2); 2094 movl(cnt2, int_cnt2); // Now restore cnt2 2095 2096 decrementl(cnt1); // Shift to next element 2097 cmpl(cnt1, cnt2); 2098 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2099 2100 addptr(result, (1<<scale1)); 2101 2102 } // (int_cnt2 > 8) 2103 2104 // Scan string for start of substr in 16-byte vectors 2105 bind(SCAN_TO_SUBSTR); 2106 pcmpestri(vec, Address(result, 0), mode); 2107 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2108 subl(cnt1, stride); 2109 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2110 cmpl(cnt1, cnt2); 2111 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2112 addptr(result, 16); 2113 jmpb(SCAN_TO_SUBSTR); 2114 2115 // Found a potential substr 2116 bind(FOUND_CANDIDATE); 2117 // Matched whole vector if first element matched (tmp(rcx) == 0).
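// (In the "equal ordered" mode used here, pcmpestri sets CF when the
// substring head matches - possibly only partially at the end of the
// window - somewhere in the 16-byte window, sets OF when that match
// starts at element 0, and returns the index of the first candidate
// match in rcx.)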
2118 if (int_cnt2 == stride) { 2119 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2120 } else { // int_cnt2 > 8 2121 jccb(Assembler::overflow, FOUND_SUBSTR); 2122 } 2123 // After pcmpestri tmp(rcx) contains matched element index 2124 // Compute start addr of substr 2125 lea(result, Address(result, tmp, scale1)); 2126 2127 // Make sure string is still long enough 2128 subl(cnt1, tmp); 2129 cmpl(cnt1, cnt2); 2130 if (int_cnt2 == stride) { 2131 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2132 } else { // int_cnt2 > 8 2133 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2134 } 2135 // Left less than substring. 2136 2137 bind(RET_NOT_FOUND); 2138 movl(result, -1); 2139 jmp(EXIT); 2140 2141 if (int_cnt2 > stride) { 2142 // This code is optimized for the case when the whole substring 2143 // is matched once its head is matched. 2144 bind(MATCH_SUBSTR_HEAD); 2145 pcmpestri(vec, Address(result, 0), mode); 2146 // Reload only the string if it does not match 2147 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2148 2149 Label CONT_SCAN_SUBSTR; 2150 // Compare the rest of substring (> 8 chars). 2151 bind(FOUND_SUBSTR); 2152 // First 8 chars are already matched. 2153 negptr(cnt2); 2154 addptr(cnt2, stride); 2155 2156 bind(SCAN_SUBSTR); 2157 subl(cnt1, stride); 2158 cmpl(cnt2, -stride); // Do not read beyond substring 2159 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2160 // Back-up strings to avoid reading beyond substring: 2161 // cnt1 = cnt1 - cnt2 + 8 2162 addl(cnt1, cnt2); // cnt2 is negative 2163 addl(cnt1, stride); 2164 movl(cnt2, stride); negptr(cnt2); 2165 bind(CONT_SCAN_SUBSTR); 2166 if (int_cnt2 < (int)G) { 2167 int tail_off1 = int_cnt2<<scale1; 2168 int tail_off2 = int_cnt2<<scale2; 2169 if (ae == StrIntrinsicNode::UL) { 2170 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2171 } else { 2172 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2173 } 2174 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2175 } else { 2176 // calculate index in register to avoid integer overflow (int_cnt2*2) 2177 movl(tmp, int_cnt2); 2178 addptr(tmp, cnt2); 2179 if (ae == StrIntrinsicNode::UL) { 2180 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2181 } else { 2182 movdqu(vec, Address(str2, tmp, scale2, 0)); 2183 } 2184 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2185 } 2186 // Need to reload the string pointers if the whole vector did not match 2187 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2188 addptr(cnt2, stride); 2189 jcc(Assembler::negative, SCAN_SUBSTR); 2190 // Fall through if found full substring 2191 2192 } // (int_cnt2 > 8) 2193 2194 bind(RET_FOUND); 2195 // Found result if we matched full small substring. 2196 // Compute substr offset 2197 subptr(result, str1); 2198 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2199 shrl(result, 1); // index 2200 } 2201 bind(EXIT); 2202 2203 } // string_indexofC8 2204 2205 // Small strings are loaded through the stack if they cross a page boundary. 2206 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2207 Register cnt1, Register cnt2, 2208 int int_cnt2, Register result, 2209 XMMRegister vec, Register tmp, 2210 int ae) { 2211 ShortBranchVerifier sbv(this); 2212 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2213 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2214 2215 // 2216 // int_cnt2 is the length of a small (< 8 chars) constant substring 2217 // or (-1) for a non-constant substring, in which case its length 2218 // is in the cnt2 register.
2219 // 2220 // Note, inline_string_indexOf() generates checks: 2221 // if (substr.count > string.count) return -1; 2222 // if (substr.count == 0) return 0; 2223 // 2224 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2225 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2226 // This method uses the pcmpestri instruction with bound registers 2227 // inputs: 2228 // xmm - substring 2229 // rax - substring length (elements count) 2230 // mem - scanned string 2231 // rdx - string length (elements count) 2232 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2233 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2234 // outputs: 2235 // rcx - matched index in string 2236 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2237 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2238 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2239 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2240 2241 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2242 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2243 FOUND_CANDIDATE; 2244 2245 { //======================================================== 2246 // We don't know where these strings are located 2247 // and we can't read beyond them. Load them through the stack. 2248 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2249 2250 movptr(tmp, rsp); // save old SP 2251 2252 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2253 if (int_cnt2 == (1>>scale2)) { // One byte 2254 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2255 load_unsigned_byte(result, Address(str2, 0)); 2256 movdl(vec, result); // move 32 bits 2257 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2258 // Not enough header space in 32-bit VM: 12+3 = 15. 2259 movl(result, Address(str2, -1)); 2260 shrl(result, 8); 2261 movdl(vec, result); // move 32 bits 2262 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2263 load_unsigned_short(result, Address(str2, 0)); 2264 movdl(vec, result); // move 32 bits 2265 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2266 movdl(vec, Address(str2, 0)); // move 32 bits 2267 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2268 movq(vec, Address(str2, 0)); // move 64 bits 2269 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7}) 2270 // Array header size is 12 bytes in 32-bit VM 2271 // + 6 bytes for 3 chars == 18 bytes, 2272 // enough space to load vec and shift. 2273 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2274 if (ae == StrIntrinsicNode::UL) { 2275 int tail_off = int_cnt2-8; 2276 pmovzxbw(vec, Address(str2, tail_off)); 2277 psrldq(vec, -2*tail_off); 2278 } 2279 else { 2280 int tail_off = int_cnt2*(1<<scale2); 2281 movdqu(vec, Address(str2, tail_off-16)); 2282 psrldq(vec, 16-tail_off); 2283 } 2284 } 2285 } else { // not constant substring 2286 cmpl(cnt2, stride); 2287 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2288 2289 // We can read beyond the string if str+16 does not cross a page boundary 2290 // since heaps are aligned and mapped by pages.
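// The check emitted below is, in effect (illustrative sketch only):
//
//   if ((str2 & (os::vm_page_size() - 1)) <= os::vm_page_size() - 16) {
//     // a 16-byte load at str2 stays within its page and cannot fault
//   } else {
//     // copy the short string to the stack and scan it from there
//   }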
2291 assert(os::vm_page_size() < (int)G, "default page should be small"); 2292 movl(result, str2); // We need only low 32 bits 2293 andl(result, (os::vm_page_size()-1)); 2294 cmpl(result, (os::vm_page_size()-16)); 2295 jccb(Assembler::belowEqual, CHECK_STR); 2296 2297 // Move small strings to stack to allow load 16 bytes into vec. 2298 subptr(rsp, 16); 2299 int stk_offset = wordSize-(1<<scale2); 2300 push(cnt2); 2301 2302 bind(COPY_SUBSTR); 2303 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2304 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2305 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2306 } else if (ae == StrIntrinsicNode::UU) { 2307 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2308 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2309 } 2310 decrement(cnt2); 2311 jccb(Assembler::notZero, COPY_SUBSTR); 2312 2313 pop(cnt2); 2314 movptr(str2, rsp); // New substring address 2315 } // non constant 2316 2317 bind(CHECK_STR); 2318 cmpl(cnt1, stride); 2319 jccb(Assembler::aboveEqual, BIG_STRINGS); 2320 2321 // Check cross page boundary. 2322 movl(result, str1); // We need only low 32 bits 2323 andl(result, (os::vm_page_size()-1)); 2324 cmpl(result, (os::vm_page_size()-16)); 2325 jccb(Assembler::belowEqual, BIG_STRINGS); 2326 2327 subptr(rsp, 16); 2328 int stk_offset = -(1<<scale1); 2329 if (int_cnt2 < 0) { // not constant 2330 push(cnt2); 2331 stk_offset += wordSize; 2332 } 2333 movl(cnt2, cnt1); 2334 2335 bind(COPY_STR); 2336 if (ae == StrIntrinsicNode::LL) { 2337 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2338 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2339 } else { 2340 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2341 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2342 } 2343 decrement(cnt2); 2344 jccb(Assembler::notZero, COPY_STR); 2345 2346 if (int_cnt2 < 0) { // not constant 2347 pop(cnt2); 2348 } 2349 movptr(str1, rsp); // New string address 2350 2351 bind(BIG_STRINGS); 2352 // Load substring. 2353 if (int_cnt2 < 0) { // -1 2354 if (ae == StrIntrinsicNode::UL) { 2355 pmovzxbw(vec, Address(str2, 0)); 2356 } else { 2357 movdqu(vec, Address(str2, 0)); 2358 } 2359 push(cnt2); // substr count 2360 push(str2); // substr addr 2361 push(str1); // string addr 2362 } else { 2363 // Small (< 8 chars) constant substrings are loaded already. 2364 movl(cnt2, int_cnt2); 2365 } 2366 push(tmp); // original SP 2367 2368 } // Finished loading 2369 2370 //======================================================== 2371 // Start search 2372 // 2373 2374 movptr(result, str1); // string addr 2375 2376 if (int_cnt2 < 0) { // Only for non constant substring 2377 jmpb(SCAN_TO_SUBSTR); 2378 2379 // SP saved at sp+0 2380 // String saved at sp+1*wordSize 2381 // Substr saved at sp+2*wordSize 2382 // Substr count saved at sp+3*wordSize 2383 2384 // Reload substr for rescan, this code 2385 // is executed only for large substrings (> 8 chars) 2386 bind(RELOAD_SUBSTR); 2387 movptr(str2, Address(rsp, 2*wordSize)); 2388 movl(cnt2, Address(rsp, 3*wordSize)); 2389 if (ae == StrIntrinsicNode::UL) { 2390 pmovzxbw(vec, Address(str2, 0)); 2391 } else { 2392 movdqu(vec, Address(str2, 0)); 2393 } 2394 // We came here after the beginning of the substring was 2395 // matched but the rest of it was not so we need to search 2396 // again. Start from the next element after the previous match. 
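// 'result' still points at the previous candidate, so the remaining
// string length can be recomputed relative to it before the scan
// resumes one element further on.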
2397 subptr(str1, result); // Restore counter 2398 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2399 shrl(str1, 1); 2400 } 2401 addl(cnt1, str1); 2402 decrementl(cnt1); // Shift to next element 2403 cmpl(cnt1, cnt2); 2404 jcc(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2405 2406 addptr(result, (1<<scale1)); 2407 } // non constant 2408 2409 // Scan string for start of substr in 16-byte vectors 2410 bind(SCAN_TO_SUBSTR); 2411 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2412 pcmpestri(vec, Address(result, 0), mode); 2413 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2414 subl(cnt1, stride); 2415 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2416 cmpl(cnt1, cnt2); 2417 jccb(Assembler::negative, RET_NOT_FOUND); // Left less than substring 2418 addptr(result, 16); 2419 2420 bind(ADJUST_STR); 2421 cmpl(cnt1, stride); // Do not read beyond string 2422 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2423 // Back-up string to avoid reading beyond string. 2424 lea(result, Address(result, cnt1, scale1, -16)); 2425 movl(cnt1, stride); 2426 jmpb(SCAN_TO_SUBSTR); 2427 2428 // Found a potential substr 2429 bind(FOUND_CANDIDATE); 2430 // After pcmpestri tmp(rcx) contains matched element index 2431 2432 // Make sure string is still long enough 2433 subl(cnt1, tmp); 2434 cmpl(cnt1, cnt2); 2435 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2436 // Left less than substring. 2437 2438 bind(RET_NOT_FOUND); 2439 movl(result, -1); 2440 jmp(CLEANUP); 2441 2442 bind(FOUND_SUBSTR); 2443 // Compute start addr of substr 2444 lea(result, Address(result, tmp, scale1)); 2445 if (int_cnt2 > 0) { // Constant substring 2446 // Repeat search for small substring (< 8 chars) 2447 // from new point without reloading substring. 2448 // Have to check that we don't read beyond string. 2449 cmpl(tmp, stride-int_cnt2); 2450 jccb(Assembler::greater, ADJUST_STR); 2451 // Fall through if matched whole substring. 2452 } else { // non constant 2453 assert(int_cnt2 == -1, "should be != 0"); 2454 2455 addl(tmp, cnt2); 2456 // Found result if we matched whole substring. 2457 cmpl(tmp, stride); 2458 jcc(Assembler::lessEqual, RET_FOUND); 2459 2460 // Repeat search for small substring (<= 8 chars) 2461 // from new point 'str1' without reloading substring. 2462 cmpl(cnt2, stride); 2463 // Have to check that we don't read beyond string. 2464 jccb(Assembler::lessEqual, ADJUST_STR); 2465 2466 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2467 // Compare the rest of substring (> 8 chars). 2468 movptr(str1, result); 2469 2470 cmpl(tmp, cnt2); 2471 // First 8 chars are already matched. 2472 jccb(Assembler::equal, CHECK_NEXT); 2473 2474 bind(SCAN_SUBSTR); 2475 pcmpestri(vec, Address(str1, 0), mode); 2476 // Need to reload the string pointers if the whole vector did not match 2477 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2478 2479 bind(CHECK_NEXT); 2480 subl(cnt2, stride); 2481 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2482 addptr(str1, 16); 2483 if (ae == StrIntrinsicNode::UL) { 2484 addptr(str2, 8); 2485 } else { 2486 addptr(str2, 16); 2487 } 2488 subl(cnt1, stride); 2489 cmpl(cnt2, stride); // Do not read beyond substring 2490 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2491 // Back-up strings to avoid reading beyond substring.
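// In effect: cnt1 = cnt1 - cnt2 + stride and cnt2 = stride, with both
// pointers rewound so the last full-width vector of the substring is
// compared instead of reading past its end.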
2492 2493 if (ae == StrIntrinsicNode::UL) { 2494 lea(str2, Address(str2, cnt2, scale2, -8)); 2495 lea(str1, Address(str1, cnt2, scale1, -16)); 2496 } else { 2497 lea(str2, Address(str2, cnt2, scale2, -16)); 2498 lea(str1, Address(str1, cnt2, scale1, -16)); 2499 } 2500 subl(cnt1, cnt2); 2501 movl(cnt2, stride); 2502 addl(cnt1, stride); 2503 bind(CONT_SCAN_SUBSTR); 2504 if (ae == StrIntrinsicNode::UL) { 2505 pmovzxbw(vec, Address(str2, 0)); 2506 } else { 2507 movdqu(vec, Address(str2, 0)); 2508 } 2509 jmp(SCAN_SUBSTR); 2510 2511 bind(RET_FOUND_LONG); 2512 movptr(str1, Address(rsp, wordSize)); 2513 } // non constant 2514 2515 bind(RET_FOUND); 2516 // Compute substr offset 2517 subptr(result, str1); 2518 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2519 shrl(result, 1); // index 2520 } 2521 bind(CLEANUP); 2522 pop(rsp); // restore SP 2523 2524 } // string_indexof 2525 2526 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2527 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2528 ShortBranchVerifier sbv(this); 2529 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2530 2531 int stride = 8; 2532 2533 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2534 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2535 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2536 FOUND_SEQ_CHAR, DONE_LABEL; 2537 2538 movptr(result, str1); 2539 if (UseAVX >= 2) { 2540 cmpl(cnt1, stride); 2541 jcc(Assembler::less, SCAN_TO_CHAR); 2542 cmpl(cnt1, 2*stride); 2543 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2544 movdl(vec1, ch); 2545 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2546 vpxor(vec2, vec2); 2547 movl(tmp, cnt1); 2548 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2549 andl(cnt1,0x0000000F); //tail count (in chars) 2550 2551 bind(SCAN_TO_16_CHAR_LOOP); 2552 vmovdqu(vec3, Address(result, 0)); 2553 vpcmpeqw(vec3, vec3, vec1, 1); 2554 vptest(vec2, vec3); 2555 jcc(Assembler::carryClear, FOUND_CHAR); 2556 addptr(result, 32); 2557 subl(tmp, 2*stride); 2558 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2559 jmp(SCAN_TO_8_CHAR); 2560 bind(SCAN_TO_8_CHAR_INIT); 2561 movdl(vec1, ch); 2562 pshuflw(vec1, vec1, 0x00); 2563 pshufd(vec1, vec1, 0); 2564 pxor(vec2, vec2); 2565 } 2566 bind(SCAN_TO_8_CHAR); 2567 cmpl(cnt1, stride); 2568 jcc(Assembler::less, SCAN_TO_CHAR); 2569 if (UseAVX < 2) { 2570 movdl(vec1, ch); 2571 pshuflw(vec1, vec1, 0x00); 2572 pshufd(vec1, vec1, 0); 2573 pxor(vec2, vec2); 2574 } 2575 movl(tmp, cnt1); 2576 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2577 andl(cnt1,0x00000007); //tail count (in chars) 2578 2579 bind(SCAN_TO_8_CHAR_LOOP); 2580 movdqu(vec3, Address(result, 0)); 2581 pcmpeqw(vec3, vec1); 2582 ptest(vec2, vec3); 2583 jcc(Assembler::carryClear, FOUND_CHAR); 2584 addptr(result, 16); 2585 subl(tmp, stride); 2586 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2587 bind(SCAN_TO_CHAR); 2588 testl(cnt1, cnt1); 2589 jcc(Assembler::zero, RET_NOT_FOUND); 2590 bind(SCAN_TO_CHAR_LOOP); 2591 load_unsigned_short(tmp, Address(result, 0)); 2592 cmpl(ch, tmp); 2593 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2594 addptr(result, 2); 2595 subl(cnt1, 1); 2596 jccb(Assembler::zero, RET_NOT_FOUND); 2597 jmp(SCAN_TO_CHAR_LOOP); 2598 2599 bind(RET_NOT_FOUND); 2600 movl(result, -1); 2601 jmpb(DONE_LABEL); 2602 2603 bind(FOUND_CHAR); 2604 if (UseAVX >= 2) { 2605 vpmovmskb(tmp, vec3); 2606 } else { 2607 pmovmskb(tmp, vec3); 2608 } 2609 bsfl(ch, tmp); 2610 addl(result, ch); 2611 2612 bind(FOUND_SEQ_CHAR); 2613 
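// Convert the match address into a char index:
// result = (result - str1) / sizeof(jchar).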
subptr(result, str1); 2614 shrl(result, 1); 2615 2616 bind(DONE_LABEL); 2617 } // string_indexof_char 2618 2619 // helper function for string_compare 2620 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 2621 Address::ScaleFactor scale, Address::ScaleFactor scale1, 2622 Address::ScaleFactor scale2, Register index, int ae) { 2623 if (ae == StrIntrinsicNode::LL) { 2624 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 2625 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 2626 } else if (ae == StrIntrinsicNode::UU) { 2627 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 2628 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 2629 } else { 2630 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 2631 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 2632 } 2633 } 2634 2635 // Compare strings, used for char[] and byte[]. 2636 void C2_MacroAssembler::string_compare(Register str1, Register str2, 2637 Register cnt1, Register cnt2, Register result, 2638 XMMRegister vec1, int ae) { 2639 ShortBranchVerifier sbv(this); 2640 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 2641 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 2642 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 2643 int stride2x2 = 0x40; 2644 Address::ScaleFactor scale = Address::no_scale; 2645 Address::ScaleFactor scale1 = Address::no_scale; 2646 Address::ScaleFactor scale2 = Address::no_scale; 2647 2648 if (ae != StrIntrinsicNode::LL) { 2649 stride2x2 = 0x20; 2650 } 2651 2652 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 2653 shrl(cnt2, 1); 2654 } 2655 // Compute the minimum of the string lengths and the 2656 // difference of the string lengths (stack). 2657 // Do the conditional move stuff 2658 movl(result, cnt1); 2659 subl(cnt1, cnt2); 2660 push(cnt1); 2661 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 2662 2663 // Is the minimum length zero? 
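// If so, the result is just the length difference that was pushed on
// the stack above.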
2664 testl(cnt2, cnt2); 2665 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2666 if (ae == StrIntrinsicNode::LL) { 2667 // Load first bytes 2668 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 2669 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 2670 } else if (ae == StrIntrinsicNode::UU) { 2671 // Load first characters 2672 load_unsigned_short(result, Address(str1, 0)); 2673 load_unsigned_short(cnt1, Address(str2, 0)); 2674 } else { 2675 load_unsigned_byte(result, Address(str1, 0)); 2676 load_unsigned_short(cnt1, Address(str2, 0)); 2677 } 2678 subl(result, cnt1); 2679 jcc(Assembler::notZero, POP_LABEL); 2680 2681 if (ae == StrIntrinsicNode::UU) { 2682 // Divide length by 2 to get number of chars 2683 shrl(cnt2, 1); 2684 } 2685 cmpl(cnt2, 1); 2686 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 2687 2688 // Check if the strings start at the same location and setup scale and stride 2689 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2690 cmpptr(str1, str2); 2691 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 2692 if (ae == StrIntrinsicNode::LL) { 2693 scale = Address::times_1; 2694 stride = 16; 2695 } else { 2696 scale = Address::times_2; 2697 stride = 8; 2698 } 2699 } else { 2700 scale1 = Address::times_1; 2701 scale2 = Address::times_2; 2702 // scale not used 2703 stride = 8; 2704 } 2705 2706 if (UseAVX >= 2 && UseSSE42Intrinsics) { 2707 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 2708 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 2709 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 2710 Label COMPARE_TAIL_LONG; 2711 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 2712 2713 int pcmpmask = 0x19; 2714 if (ae == StrIntrinsicNode::LL) { 2715 pcmpmask &= ~0x01; 2716 } 2717 2718 // Setup to compare 16-chars (32-bytes) vectors, 2719 // start from first character again because it has aligned address. 2720 if (ae == StrIntrinsicNode::LL) { 2721 stride2 = 32; 2722 } else { 2723 stride2 = 16; 2724 } 2725 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2726 adr_stride = stride << scale; 2727 } else { 2728 adr_stride1 = 8; //stride << scale1; 2729 adr_stride2 = 16; //stride << scale2; 2730 } 2731 2732 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 2733 // rax and rdx are used by pcmpestri as elements counters 2734 movl(result, cnt2); 2735 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 2736 jcc(Assembler::zero, COMPARE_TAIL_LONG); 2737 2738 // fast path : compare first 2 8-char vectors. 
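// (pcmpmask 0x19 selects "equal each" aggregation with negated
// polarity, so pcmpestri reports the index of the first *mismatching*
// element in rcx; clearing the low bit switches the element size from
// unsigned words to unsigned bytes for the LL case.)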
2739 bind(COMPARE_16_CHARS); 2740 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2741 movdqu(vec1, Address(str1, 0)); 2742 } else { 2743 pmovzxbw(vec1, Address(str1, 0)); 2744 } 2745 pcmpestri(vec1, Address(str2, 0), pcmpmask); 2746 jccb(Assembler::below, COMPARE_INDEX_CHAR); 2747 2748 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2749 movdqu(vec1, Address(str1, adr_stride)); 2750 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 2751 } else { 2752 pmovzxbw(vec1, Address(str1, adr_stride1)); 2753 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 2754 } 2755 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 2756 addl(cnt1, stride); 2757 2758 // Compare the characters at index in cnt1 2759 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 2760 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 2761 subl(result, cnt2); 2762 jmp(POP_LABEL); 2763 2764 // Setup the registers to start vector comparison loop 2765 bind(COMPARE_WIDE_VECTORS); 2766 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2767 lea(str1, Address(str1, result, scale)); 2768 lea(str2, Address(str2, result, scale)); 2769 } else { 2770 lea(str1, Address(str1, result, scale1)); 2771 lea(str2, Address(str2, result, scale2)); 2772 } 2773 subl(result, stride2); 2774 subl(cnt2, stride2); 2775 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 2776 negptr(result); 2777 2778 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 2779 bind(COMPARE_WIDE_VECTORS_LOOP); 2780 2781 #ifdef _LP64 2782 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 2783 cmpl(cnt2, stride2x2); 2784 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 2785 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 2786 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 2787 2788 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 2789 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2790 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 2791 evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 2792 } else { 2793 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 2794 evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 2795 } 2796 kortestql(k7, k7); 2797 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 2798 addptr(result, stride2x2); // update since we already compared at this addr 2799 subl(cnt2, stride2x2); // and sub the size too 2800 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 2801 2802 vpxor(vec1, vec1); 2803 jmpb(COMPARE_WIDE_TAIL); 2804 }//if (VM_Version::supports_avx512vlbw()) 2805 #endif // _LP64 2806 2807 2808 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 2809 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2810 vmovdqu(vec1, Address(str1, result, scale)); 2811 vpxor(vec1, Address(str2, result, scale)); 2812 } else { 2813 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 2814 vpxor(vec1, Address(str2, result, scale2)); 2815 } 2816 vptest(vec1, vec1); 2817 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 2818 addptr(result, stride2); 2819 subl(cnt2, stride2); 2820 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 2821 // clean upper bits of YMM registers 2822 
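// (zeroing vec1 here prevents AVX-SSE transition penalties in the
// legacy-SSE pcmpestri code reached from below)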
vpxor(vec1, vec1); 2823 2824 // compare wide vectors tail 2825 bind(COMPARE_WIDE_TAIL); 2826 testptr(result, result); 2827 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2828 2829 movl(result, stride2); 2830 movl(cnt2, result); 2831 negptr(result); 2832 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 2833 2834 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 2835 bind(VECTOR_NOT_EQUAL); 2836 // clean upper bits of YMM registers 2837 vpxor(vec1, vec1); 2838 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2839 lea(str1, Address(str1, result, scale)); 2840 lea(str2, Address(str2, result, scale)); 2841 } else { 2842 lea(str1, Address(str1, result, scale1)); 2843 lea(str2, Address(str2, result, scale2)); 2844 } 2845 jmp(COMPARE_16_CHARS); 2846 2847 // Compare tail chars, length between 1 to 15 chars 2848 bind(COMPARE_TAIL_LONG); 2849 movl(cnt2, result); 2850 cmpl(cnt2, stride); 2851 jcc(Assembler::less, COMPARE_SMALL_STR); 2852 2853 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2854 movdqu(vec1, Address(str1, 0)); 2855 } else { 2856 pmovzxbw(vec1, Address(str1, 0)); 2857 } 2858 pcmpestri(vec1, Address(str2, 0), pcmpmask); 2859 jcc(Assembler::below, COMPARE_INDEX_CHAR); 2860 subptr(cnt2, stride); 2861 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2862 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2863 lea(str1, Address(str1, result, scale)); 2864 lea(str2, Address(str2, result, scale)); 2865 } else { 2866 lea(str1, Address(str1, result, scale1)); 2867 lea(str2, Address(str2, result, scale2)); 2868 } 2869 negptr(cnt2); 2870 jmpb(WHILE_HEAD_LABEL); 2871 2872 bind(COMPARE_SMALL_STR); 2873 } else if (UseSSE42Intrinsics) { 2874 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 2875 int pcmpmask = 0x19; 2876 // Setup to compare 8-char (16-byte) vectors, 2877 // start from first character again because it has aligned address. 
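// Both string pointers are advanced by the minimum length and then
// indexed with the negative offset kept in 'result', so the data is
// still walked front to back; cnt2 counts the elements left to compare
// in full vectors and reaches zero after the last one.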
2878 movl(result, cnt2); 2879 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 2880 if (ae == StrIntrinsicNode::LL) { 2881 pcmpmask &= ~0x01; 2882 } 2883 jcc(Assembler::zero, COMPARE_TAIL); 2884 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2885 lea(str1, Address(str1, result, scale)); 2886 lea(str2, Address(str2, result, scale)); 2887 } else { 2888 lea(str1, Address(str1, result, scale1)); 2889 lea(str2, Address(str2, result, scale2)); 2890 } 2891 negptr(result); 2892 2893 // pcmpestri 2894 // inputs: 2895 // vec1- substring 2896 // rax - negative string length (elements count) 2897 // mem - scanned string 2898 // rdx - string length (elements count) 2899 // pcmpmask - cmp mode: 11000 (string compare with negated result) 2900 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 2901 // outputs: 2902 // rcx - first mismatched element index 2903 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 2904 2905 bind(COMPARE_WIDE_VECTORS); 2906 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2907 movdqu(vec1, Address(str1, result, scale)); 2908 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 2909 } else { 2910 pmovzxbw(vec1, Address(str1, result, scale1)); 2911 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 2912 } 2913 // After pcmpestri cnt1(rcx) contains mismatched element index 2914 2915 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 2916 addptr(result, stride); 2917 subptr(cnt2, stride); 2918 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 2919 2920 // compare wide vectors tail 2921 testptr(result, result); 2922 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2923 2924 movl(cnt2, stride); 2925 movl(result, stride); 2926 negptr(result); 2927 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2928 movdqu(vec1, Address(str1, result, scale)); 2929 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 2930 } else { 2931 pmovzxbw(vec1, Address(str1, result, scale1)); 2932 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 2933 } 2934 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 2935 2936 // Mismatched characters in the vectors 2937 bind(VECTOR_NOT_EQUAL); 2938 addptr(cnt1, result); 2939 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 2940 subl(result, cnt2); 2941 jmpb(POP_LABEL); 2942 2943 bind(COMPARE_TAIL); // limit is zero 2944 movl(cnt2, result); 2945 // Fallthru to tail compare 2946 } 2947 // Shift str2 and str1 to the end of the arrays, negate min 2948 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2949 lea(str1, Address(str1, cnt2, scale)); 2950 lea(str2, Address(str2, cnt2, scale)); 2951 } else { 2952 lea(str1, Address(str1, cnt2, scale1)); 2953 lea(str2, Address(str2, cnt2, scale2)); 2954 } 2955 decrementl(cnt2); // first character was compared already 2956 negptr(cnt2); 2957 2958 // Compare the rest of the elements 2959 bind(WHILE_HEAD_LABEL); 2960 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 2961 subl(result, cnt1); 2962 jccb(Assembler::notZero, POP_LABEL); 2963 increment(cnt2); 2964 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 2965 2966 // Strings are equal up to min length. Return the length difference. 
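// (The difference cnt1 - cnt2 was pushed before the compare loops; for
// UU it is still a byte count at this point and is converted back to a
// char count below.)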
2967 bind(LENGTH_DIFF_LABEL); 2968 pop(result); 2969 if (ae == StrIntrinsicNode::UU) { 2970 // Divide diff by 2 to get number of chars 2971 sarl(result, 1); 2972 } 2973 jmpb(DONE_LABEL); 2974 2975 #ifdef _LP64 2976 if (VM_Version::supports_avx512vlbw()) { 2977 2978 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 2979 2980 kmovql(cnt1, k7); 2981 notq(cnt1); 2982 bsfq(cnt2, cnt1); 2983 if (ae != StrIntrinsicNode::LL) { 2984 // Divide diff by 2 to get number of chars 2985 sarl(cnt2, 1); 2986 } 2987 addq(result, cnt2); 2988 if (ae == StrIntrinsicNode::LL) { 2989 load_unsigned_byte(cnt1, Address(str2, result)); 2990 load_unsigned_byte(result, Address(str1, result)); 2991 } else if (ae == StrIntrinsicNode::UU) { 2992 load_unsigned_short(cnt1, Address(str2, result, scale)); 2993 load_unsigned_short(result, Address(str1, result, scale)); 2994 } else { 2995 load_unsigned_short(cnt1, Address(str2, result, scale2)); 2996 load_unsigned_byte(result, Address(str1, result, scale1)); 2997 } 2998 subl(result, cnt1); 2999 jmpb(POP_LABEL); 3000 }//if (VM_Version::supports_avx512vlbw()) 3001 #endif // _LP64 3002 3003 // Discard the stored length difference 3004 bind(POP_LABEL); 3005 pop(cnt1); 3006 3007 // That's it 3008 bind(DONE_LABEL); 3009 if(ae == StrIntrinsicNode::UL) { 3010 negl(result); 3011 } 3012 3013 } 3014 3015 // Search for Non-ASCII character (Negative byte value) in a byte array, 3016 // return true if it has any and false otherwise. 3017 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3018 // @HotSpotIntrinsicCandidate 3019 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3020 // for (int i = off; i < off + len; i++) { 3021 // if (ba[i] < 0) { 3022 // return true; 3023 // } 3024 // } 3025 // return false; 3026 // } 3027 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3028 Register result, Register tmp1, 3029 XMMRegister vec1, XMMRegister vec2) { 3030 // rsi: byte array 3031 // rcx: len 3032 // rax: result 3033 ShortBranchVerifier sbv(this); 3034 assert_different_registers(ary1, len, result, tmp1); 3035 assert_different_registers(vec1, vec2); 3036 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3037 3038 // len == 0 3039 testl(len, len); 3040 jcc(Assembler::zero, FALSE_LABEL); 3041 3042 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3043 VM_Version::supports_avx512vlbw() && 3044 VM_Version::supports_bmi2()) { 3045 3046 Label test_64_loop, test_tail; 3047 Register tmp3_aliased = len; 3048 3049 movl(tmp1, len); 3050 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3051 3052 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3053 andl(len, ~(64 - 1)); // vector count (in chars) 3054 jccb(Assembler::zero, test_tail); 3055 3056 lea(ary1, Address(ary1, len, Address::times_1)); 3057 negptr(len); 3058 3059 bind(test_64_loop); 3060 // Check whether our 64 elements of size byte contain negatives 3061 evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3062 kortestql(k2, k2); 3063 jcc(Assembler::notZero, TRUE_LABEL); 3064 3065 addptr(len, 64); 3066 jccb(Assembler::notZero, test_64_loop); 3067 3068 3069 bind(test_tail); 3070 // bail out when there is nothing to be done 3071 testl(tmp1, -1); 3072 jcc(Assembler::zero, FALSE_LABEL); 3073 3074 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3075 #ifdef _LP64 3076 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3077 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3078 notq(tmp3_aliased); 3079 kmovql(k3, tmp3_aliased); 3080 #else 3081 Label 
#else
    Label k_init;
    jmp(k_init);

    // On 32-bit we cannot read 64 bits from a general purpose register into
    // a k register, so the data required to compose 64 1's is moved into the
    // instruction stream instead: a 64-byte series of the values 0..63,
    // used below as compare targets against the tail count held in tmp1.
    // The result is a k register with tmp1 consecutive 1's, counting from
    // the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
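
  // The scalar tail below relies on the same sign-bit trick as the vector
  // loops: a byte is negative iff its 0x80 bit is set, so AND-ing a whole
  // 32-bit word against 0x80808080 tests four bytes at once. A C sketch
  // (ours, for illustration only):
  //   bool word_has_negative(uint32_t w) { return (w & 0x80808080u) != 0; }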
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }
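
  // Note: limit holds an element count here. For char[] the compare loops
  // below operate on bytes, so it is doubled next (e.g., 7 chars become
  // 14 bytes); for byte[] it is already a byte count.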
  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare
      addptr(limit, 64); // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via the non-wide path:
      //   cmpl(limit, 0);
      //   jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit, which are
      // not farther than 64 bytes from the ends of the arrays (ary{1,2}+result),
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
      //
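      // Worked example (ours, for illustration): with 100-byte arrays the
      // loop above compares bytes [0, 64) and exits with limit == -32 and
      // result == 4, leaving 36 bytes unchecked; re-reading the last 64
      // bytes, i.e. [36, 100), covers them and harmlessly re-compares 28
      // bytes already known to match.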
3313 // 3314 addptr(result, -64); // it is safe, bc we just came from this area 3315 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3316 evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3317 kortestql(k7, k7); 3318 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3319 3320 jmp(TRUE_LABEL); 3321 3322 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3323 3324 }//if (VM_Version::supports_avx512vlbw()) 3325 #endif //_LP64 3326 bind(COMPARE_WIDE_VECTORS); 3327 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3328 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3329 vpxor(vec1, vec2); 3330 3331 vptest(vec1, vec1); 3332 jcc(Assembler::notZero, FALSE_LABEL); 3333 addptr(limit, 32); 3334 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3335 3336 testl(result, result); 3337 jcc(Assembler::zero, TRUE_LABEL); 3338 3339 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3340 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3341 vpxor(vec1, vec2); 3342 3343 vptest(vec1, vec1); 3344 jccb(Assembler::notZero, FALSE_LABEL); 3345 jmpb(TRUE_LABEL); 3346 3347 bind(COMPARE_TAIL); // limit is zero 3348 movl(limit, result); 3349 // Fallthru to tail compare 3350 } else if (UseSSE42Intrinsics) { 3351 // With SSE4.2, use double quad vector compare 3352 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3353 3354 // Compare 16-byte vectors 3355 andl(result, 0x0000000f); // tail count (in bytes) 3356 andl(limit, 0xfffffff0); // vector count (in bytes) 3357 jcc(Assembler::zero, COMPARE_TAIL); 3358 3359 lea(ary1, Address(ary1, limit, Address::times_1)); 3360 lea(ary2, Address(ary2, limit, Address::times_1)); 3361 negptr(limit); 3362 3363 bind(COMPARE_WIDE_VECTORS); 3364 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3365 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3366 pxor(vec1, vec2); 3367 3368 ptest(vec1, vec1); 3369 jcc(Assembler::notZero, FALSE_LABEL); 3370 addptr(limit, 16); 3371 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3372 3373 testl(result, result); 3374 jcc(Assembler::zero, TRUE_LABEL); 3375 3376 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3377 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3378 pxor(vec1, vec2); 3379 3380 ptest(vec1, vec1); 3381 jccb(Assembler::notZero, FALSE_LABEL); 3382 jmpb(TRUE_LABEL); 3383 3384 bind(COMPARE_TAIL); // limit is zero 3385 movl(limit, result); 3386 // Fallthru to tail compare 3387 } 3388 3389 // Compare 4-byte vectors 3390 andl(limit, 0xfffffffc); // vector count (in bytes) 3391 jccb(Assembler::zero, COMPARE_CHAR); 3392 3393 lea(ary1, Address(ary1, limit, Address::times_1)); 3394 lea(ary2, Address(ary2, limit, Address::times_1)); 3395 negptr(limit); 3396 3397 bind(COMPARE_VECTORS); 3398 movl(chr, Address(ary1, limit, Address::times_1)); 3399 cmpl(chr, Address(ary2, limit, Address::times_1)); 3400 jccb(Assembler::notEqual, FALSE_LABEL); 3401 addptr(limit, 4); 3402 jcc(Assembler::notZero, COMPARE_VECTORS); 3403 3404 // Compare trailing char (final 2 bytes), if any 3405 bind(COMPARE_CHAR); 3406 testl(result, 0x2); // tail char 3407 jccb(Assembler::zero, COMPARE_BYTE); 3408 load_unsigned_short(chr, Address(ary1, 0)); 3409 load_unsigned_short(limit, Address(ary2, 0)); 3410 cmpl(chr, limit); 3411 jccb(Assembler::notEqual, FALSE_LABEL); 3412 3413 if (is_array_equ && is_char) { 3414 bind(COMPARE_BYTE); 3415 } else { 3416 lea(ary1, Address(ary1, 2)); 3417 lea(ary2, Address(ary2, 2)); 3418 3419 bind(COMPARE_BYTE); 3420 testl(result, 0x1); // tail 
  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
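
// Scalar reference for the whole-array (is_array_equ) case above; an
// illustrative sketch only, where the helper name and types are ours:
//   static bool bytes_equal(const jbyte* a, int a_len, const jbyte* b, int b_len) {
//     if (a == b)                 return true;  // same array
//     if (a == NULL || b == NULL) return false;
//     if (a_len != b_len)         return false;
//     for (int i = 0; i < a_len; i++) {
//       if (a[i] != b[i])         return false;
//     }
//     return true;
//   }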