/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input:  rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count *  RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)
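  //
  // An illustrative calculation, assuming the default flag values
  // (RTMAbortThreshold = 1000, RTMTotalCountIncrRate = 64, RTMAbortRatio = 50):
  // with abort_count = 1200 and total_count = 40 we compare
  // 1200 * 100 = 120000 against 40 * 64 * 50 = 128000, so the observed abort
  // ratio is still below 50% and the no_rtm bit is not set.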
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input:  abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}
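// Taken together, the two retry helpers above implement, in effect, the
// following policy around each transactional attempt (a rough C-like sketch,
// not emitted code):
//
//   for (int i = RTMRetryCount; i > 0; i--) {
//     status = xbegin();                               // abort status lands in rax
//     if (transaction started) { ... fast path ... }
//     else if (status & 0x6)   { pause(); continue; }  // retryable abort
//     else break;                                      // non-retryable: slow path
//   }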
// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);    // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
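// A note on the UseRTMXendForLockBusy choice above: when the object turns out
// to be locked, xend() commits the (empty) transaction and lets us hand a
// synthetic, retryable status (0x2) straight to the retry logic, whereas
// xabort(0) takes the full abort path through L_on_abort, including
// profiling. Which is cheaper is presumably hardware-dependent, hence the
// flag.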
// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}
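// Note the key RTM property exploited above: inside the transaction we only
// *read* _owner and check that it is NULL -- we never store to it. The read
// pulls _owner into the transaction's read-set, so any competing writer
// aborts us; an owner-free read is therefore enough to "hold" the lock for
// the duration of the transaction (classic transactional lock elision).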
#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time.  These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag.  fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods.  That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross over to native code would be
// to park() or unpark() threads.  We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// * Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//   This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//   Given TLAB allocation, Self is usually manifested in a register, so passing it into
//   the lock operators would typically be faster than reifying Self.
//
// * Ideally I'd define the primitives as:
//      fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//      fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//   Unfortunately ADLC bugs prevent us from expressing the ideal form.
//   Instead, we're stuck with the rather awkward and brittle register assignments below.
//   Furthermore the register assignments are overconstrained, possibly resulting in
//   sub-optimal code near the synchronization site.
//
// * Eliminate the sp-proximity tests and just use "== Self" tests instead.
//   Alternately, use a better sp-proximity test.
//
// * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//   Either one is sufficient to uniquely identify a thread.
//   TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// * Intrinsify notify() and notifyAll() for the common cases where the
//   object is locked by the calling thread but the waitlist is empty.
//   Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// * Use jccb and jmpb instead of jcc and jmp to improve code density.
//   But beware of excessive branch density on AMD Opterons.
//
// * Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//   or failure of the fast path.  If the fast path fails then we pass
//   control to the slow path, typically in C.  In fast_lock and
//   fast_unlock we often branch to DONE_LABEL, just to find that C2
//   will emit a conditional branch immediately after the node.
//   So we have branches to branches and lots of ICC.ZF games.
//   Instead, it might be better to have C2 pass a "FailureLabel"
//   into fast_lock and fast_unlock.  In the case of success, control
//   will drop through the node.  ICC.ZF is undefined at exit.
//   In the case of failure, the node will branch directly to the
//   FailureLabel
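// In outline, the fast-path triage below proceeds as follows (a sketch, not
// emitted code):
//
//   mark = obj->mark();
//   if (mark is inflated)      // monitor bit set
//     success = CAS(&monitor->_owner, NULL, Self);
//   else {                     // neutral or stack-locked
//     box->dhw = mark | unlocked_value;
//     success = CAS(&obj->mark, box->dhw, box);   // try stack-lock
//     if (!success)
//       success = (mark - rsp) < page_size;       // recursive stack-lock?
//   }
//   ZF = success;              // consumed by the branch C2 emits after the node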
// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));         // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes()));     // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);        // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
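  // A worked example, assuming a 4 KiB page: on LP64, 7 - 4096 == -4089, i.e.
  // the mask 0xFFFF...F007. ANDing keeps the page-number bits plus the low
  // three (alignment/lock) bits, so the result is zero -- and ZF is set --
  // exactly when the displaced header lies within one page of rsp and the
  // low bits are clear.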
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark().  Any non-0 value suffices.
  // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
  // we later store "Self" into m->Owner.  Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3);            // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb  (Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread (scrReg);                      // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg);                   // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
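// In outline, fast_unlock() below performs (a sketch, not emitted code):
//
//   if (box->dhw == 0)        // recursive stack-lock
//     ZF = 1;
//   else if (mark is inflated) {
//     if (recursions == 0 && (cxq|EntryList) == 0) { _owner = NULL; ZF = 1; }
//     else take the 1-0 exit: ST _owner = NULL; MEMBAR; re-check _succ,
//          possibly re-acquiring via CAS to arbitrate succession;
//   } else
//     ZF = CAS(&obj->mark, box, box->dhw);  // swing mark back to displaced header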
void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit.  That is, exit without
  // a costly MEMBAR or CAS.  See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock.  Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory model is TSO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-thru into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);                      // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);                      // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0));      // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}
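// Roughly how C2 consumes these helpers: the cmpFastLock/cmpFastUnlock nodes
// in the .ad file emit fast_lock()/fast_unlock(), and the compiler then
// branches on ZF, routing ZF==0 to the runtime monitorenter/monitorexit slow
// path. This is the ZF protocol described in the comments above.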
//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src);  // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src);  // xmm0 as mask
    }
  }
}
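// Note on the T_LONG paths above: SSE4.1 has no packed 64-bit min/max, so we
// synthesize one. blendvpd takes its selection mask implicitly in xmm0 (which
// is why the caller must pass tmp == xmm0); pcmpgtq builds a per-lane
// all-ones/all-zeros mask, and blendvpd then picks src in the lanes where the
// mask is set, yielding the min (or max).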
void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}
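// The blend/min/cmp/blend dance above is what gives the vector paths Java
// Math.min/max semantics, which vminps/vminpd alone do not provide. The two
// leading blends (keyed on a sign bit) order -0.0 vs +0.0 correctly, the
// min/max does the ordinary compare, and the UNORD_Q compare plus final blend
// propagates a NaN from either input. Per lane, roughly:
//
//   min(a, b) = isNaN(a) ? a : isNaN(b) ? b : (a < b, with -0.0 < +0.0) ? a : b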
void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
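// Same algorithm as vminmax_fp, expressed with AVX-512 opmasks: evpmovd2m /
// evpmovq2m copy each lane's sign bit into ktmp (identifying the negative or
// -0.0 lanes), the masked blends order the operands, and the final masked
// move merges the NaN lanes detected by the UNORD_Q compare back into dst.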
void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_VRShiftV:  vpsravd(dst, src, shift, vlen_enc); break;
    case Op_VLShiftV:  vpsllvd(dst, src, shift, vlen_enc); break;
    case Op_VURShiftV: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_VRShiftV:  evpsravw(dst, src, shift, vlen_enc); break;
    case Op_VLShiftV:  evpsllvw(dst, src, shift, vlen_enc); break;
    case Op_VURShiftV: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_VRShiftV: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_VLShiftV: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_VURShiftV: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}
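// The AVX2 fallback in varshiftq uses the standard sign-extension identity
// sra(x, s) == ((x >>> s) ^ m) - m with m = 0x8000000000000000 >>> s, i.e.
// m = 1 << (63 - s). A quick check with x = -8 and s = 1: x >>> 1 = 0x7FF..FC,
// XOR with m = 1 << 62 gives 0x3FF..FC, and subtracting m borrows through the
// upper bits, yielding 0xFFF..FC = -4, as expected. Positive inputs are
// unchanged because the XOR sets the bit that the SUB then clears.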
// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  bool sign = (opcode == Op_VURShiftV) ? false : true;
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  bool sign = (opcode == Op_VURShiftV) ? false : true;
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
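// x86 has no byte-granular vector shifts, so the byte variants above widen
// each byte to a dword (varshiftbw) or word (evarshiftb), shift at the wider
// granularity, mask the result back into byte range with the
// vector_*_to_byte_mask() constants, and re-pack with vpackusdw/vpackuswb
// (plus a vpermq lane fix-up for the 256-bit case, since the pack
// instructions operate per 128-bit lane).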
void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
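// load_vector_mask turns a vector of 0/1 booleans into a full-width 0/-1 lane
// mask: 0 - x maps byte 0x01 to 0xFF (and leaves 0x00 alone), and the
// sign-extending moves then widen each byte to the element size, e.g. 0xFF
// becomes 0xFFFFFFFF for T_INT lanes.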
void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 16) {
    movdqu(dst, addr, scratch);
  } else if (vlen_in_bytes == 32) {
    vmovdqu(dst, addr, scratch);
  } else {
    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
  }
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:        pminsb(dst, src); break;
        case T_SHORT:       pminsw(dst, src); break;
        case T_INT:         pminsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:        pmaxsb(dst, src); break;
        case T_SHORT:       pmaxsw(dst, src); break;
        case T_INT:         pmaxsd(dst, src); break;
        case T_LONG:        assert(UseAVX > 2, "required");
                            vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:        paddb(dst, src); break;
        case T_SHORT:       paddw(dst, src); break;
        case T_INT:         paddd(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT:       pmullw(dst, src); break;
        case T_INT:         pmulld(dst, src); break;
        default:            assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            vpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}
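// Note that the 64-bit element operations used above (vpminsq/vpmaxsq and
// vpmullq) exist only as AVX-512 encodings, which is why even these 128-bit
// reduction steps carry UseAVX > 2 asserts.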
T_SHORT: vpmullw(dst, src1, src2, vector_len); break; 1456 case T_INT: vpmulld(dst, src1, src2, vector_len); break; 1457 default: assert(false, "wrong type"); 1458 } 1459 break; 1460 case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break; 1461 default: assert(false, "wrong opcode"); 1462 } 1463 } 1464 1465 void C2_MacroAssembler::reduce_fp(int opcode, int vlen, 1466 XMMRegister dst, XMMRegister src, 1467 XMMRegister vtmp1, XMMRegister vtmp2) { 1468 switch (opcode) { 1469 case Op_AddReductionVF: 1470 case Op_MulReductionVF: 1471 reduceF(opcode, vlen, dst, src, vtmp1, vtmp2); 1472 break; 1473 1474 case Op_AddReductionVD: 1475 case Op_MulReductionVD: 1476 reduceD(opcode, vlen, dst, src, vtmp1, vtmp2); 1477 break; 1478 1479 default: assert(false, "wrong opcode"); 1480 } 1481 } 1482 1483 void C2_MacroAssembler::reduceB(int opcode, int vlen, 1484 Register dst, Register src1, XMMRegister src2, 1485 XMMRegister vtmp1, XMMRegister vtmp2) { 1486 switch (vlen) { 1487 case 8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1488 case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1489 case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1490 case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1491 1492 default: assert(false, "wrong vector length"); 1493 } 1494 } 1495 1496 void C2_MacroAssembler::mulreduceB(int opcode, int vlen, 1497 Register dst, Register src1, XMMRegister src2, 1498 XMMRegister vtmp1, XMMRegister vtmp2) { 1499 switch (vlen) { 1500 case 8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1501 case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1502 case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1503 case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1504 1505 default: assert(false, "wrong vector length"); 1506 } 1507 } 1508 1509 void C2_MacroAssembler::reduceS(int opcode, int vlen, 1510 Register dst, Register src1, XMMRegister src2, 1511 XMMRegister vtmp1, XMMRegister vtmp2) { 1512 switch (vlen) { 1513 case 4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1514 case 8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1515 case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1516 case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1517 1518 default: assert(false, "wrong vector length"); 1519 } 1520 } 1521 1522 void C2_MacroAssembler::reduceI(int opcode, int vlen, 1523 Register dst, Register src1, XMMRegister src2, 1524 XMMRegister vtmp1, XMMRegister vtmp2) { 1525 switch (vlen) { 1526 case 2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1527 case 4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1528 case 8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break; 1529 case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1530 1531 default: assert(false, "wrong vector length"); 1532 } 1533 } 1534 1535 #ifdef _LP64 1536 void C2_MacroAssembler::reduceL(int opcode, int vlen, 1537 Register dst, Register src1, XMMRegister src2, 1538 XMMRegister vtmp1, XMMRegister vtmp2) { 1539 switch (vlen) { 1540 case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1541 case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1542 case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break; 1543 1544 default: assert(false, "wrong vector length"); 1545 } 1546 } 1547 #endif // _LP64 1548 1549 void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, 
XMMRegister vtmp1, XMMRegister vtmp2) { 1550 switch (vlen) { 1551 case 2: 1552 assert(vtmp2 == xnoreg, ""); 1553 reduce2F(opcode, dst, src, vtmp1); 1554 break; 1555 case 4: 1556 assert(vtmp2 == xnoreg, ""); 1557 reduce4F(opcode, dst, src, vtmp1); 1558 break; 1559 case 8: 1560 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1561 break; 1562 case 16: 1563 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1564 break; 1565 default: assert(false, "wrong vector length"); 1566 } 1567 } 1568 1569 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1570 switch (vlen) { 1571 case 2: 1572 assert(vtmp2 == xnoreg, ""); 1573 reduce2D(opcode, dst, src, vtmp1); 1574 break; 1575 case 4: 1576 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1577 break; 1578 case 8: 1579 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1580 break; 1581 default: assert(false, "wrong vector length"); 1582 } 1583 } 1584 1585 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1586 if (opcode == Op_AddReductionVI) { 1587 if (vtmp1 != src2) { 1588 movdqu(vtmp1, src2); 1589 } 1590 phaddd(vtmp1, vtmp1); 1591 } else { 1592 pshufd(vtmp1, src2, 0x1); 1593 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1594 } 1595 movdl(vtmp2, src1); 1596 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1597 movdl(dst, vtmp1); 1598 } 1599 1600 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1601 if (opcode == Op_AddReductionVI) { 1602 if (vtmp1 != src2) { 1603 movdqu(vtmp1, src2); 1604 } 1605 phaddd(vtmp1, src2); 1606 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1607 } else { 1608 pshufd(vtmp2, src2, 0xE); 1609 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1610 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1611 } 1612 } 1613 1614 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1615 if (opcode == Op_AddReductionVI) { 1616 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1617 vextracti128_high(vtmp2, vtmp1); 1618 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1619 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1620 } else { 1621 vextracti128_high(vtmp1, src2); 1622 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1623 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1624 } 1625 } 1626 1627 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1628 vextracti64x4_high(vtmp2, src2); 1629 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1630 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1631 } 1632 1633 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1634 pshufd(vtmp2, src2, 0x1); 1635 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1636 movdqu(vtmp1, vtmp2); 1637 psrldq(vtmp1, 2); 1638 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1639 movdqu(vtmp2, vtmp1); 1640 psrldq(vtmp2, 1); 1641 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1642 movdl(vtmp2, src1); 1643 pmovsxbd(vtmp1, vtmp1); 1644 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1645 pextrb(dst, vtmp1, 0x0); 1646 movsbl(dst, dst); 1647 } 1648 1649 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1650 
  pshufd(vtmp1, src2, 0xE);
  reduce_operation_128(T_BYTE, opcode, vtmp1, src2);
  reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp2, src2);
  reduce_operation_128(T_BYTE, opcode, vtmp2, src2);
  reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2);
  reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pmovsxbw(vtmp2, src2);
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 1) {
    int vector_len = Assembler::AVX_256bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pmovsxbw(vtmp2, src2);
    reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
    pshufd(vtmp2, src2, 0xE);  // bring the upper 8 bytes into the low half
    pmovsxbw(vtmp2, vtmp2);    // and sign-extend them to words
    reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (UseAVX > 2 && VM_Version::supports_avx512bw()) {
    int vector_len = Assembler::AVX_512bit;
    vpmovsxbw(vtmp1, src2, vector_len);
    reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    assert(UseAVX >= 2, "Should not reach here.");
    mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2);
    vextracti128_high(vtmp2, src2);
    mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2);
  vextracti64x4_high(vtmp2, src2);
  mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, vtmp1);
    phaddw(vtmp1, vtmp1);
  } else {
    pshufd(vtmp2, src2, 0x1);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
    movdqu(vtmp1, vtmp2);
    psrldq(vtmp1, 2);
    reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2);
  }
  movdl(vtmp2, src1);
  pmovsxwd(vtmp1, vtmp1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  pextrw(dst, vtmp1, 0x0);
  movswl(dst, dst);
}

void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddw(vtmp1, src2);
  } else {
    pshufd(vtmp1, src2, 0xE);
    reduce_operation_128(T_SHORT, opcode, vtmp1, src2);
  }
  reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    int vector_len = Assembler::AVX_256bit;
    vphaddw(vtmp2, src2, src2, vector_len);
    vpermq(vtmp2, vtmp2, 0xD8, vector_len);
  } else {
    vextracti128_high(vtmp2, src2);
    reduce_operation_128(T_SHORT, opcode, vtmp2, src2);
  }
  reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  int vector_len = Assembler::AVX_256bit;
  vextracti64x4_high(vtmp1, src2);
  reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2);
  reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

#ifdef _LP64
void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  pshufd(vtmp2, src2, 0xE);
  reduce_operation_128(T_LONG, opcode, vtmp2, src2);
  movdq(vtmp1, src1);
  reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2);
  movdq(dst, vtmp1);
}

void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti128_high(vtmp1, src2);
  reduce_operation_128(T_LONG, opcode, vtmp1, src2);
  reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2);
  reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
#endif // _LP64

void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_FLOAT, opcode, dst, src);
  pshufd(vtmp, src, 0x1);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce2F(opcode, dst, src, vtmp);
  pshufd(vtmp, src, 0x2);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
  pshufd(vtmp, src, 0x3);
  reduce_operation_128(T_FLOAT, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4F(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce4F(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce8F(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) {
  reduce_operation_128(T_DOUBLE, opcode, dst, src);
  pshufd(vtmp, src, 0xE);
  reduce_operation_128(T_DOUBLE, opcode, dst, vtmp);
}

void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce2D(opcode, dst, src, vtmp2);
  vextractf128_high(vtmp2, src);
  reduce2D(opcode, dst, vtmp2, vtmp1);
}

void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  reduce4D(opcode, dst, src, vtmp1, vtmp2);
  vextracti64x4_high(vtmp1, src);
  reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2);
}

void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid,
                                          XMMRegister dst, XMMRegister src,
                                          XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                          XMMRegister xmm_0, XMMRegister xmm_1) {
  int permconst[] = {1, 14};
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;

  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 16) {
    vlen_enc = Assembler::AVX_256bit;
  }

  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 3) {
      vextracti64x4_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti128_high(wtmp, wsrc);
    } else { // i = [0,1]
      vpermilps(wtmp, wsrc, permconst[i], vlen_enc);
    }
    vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src,
                                           XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                           XMMRegister xmm_0, XMMRegister xmm_1) {
  XMMRegister wsrc = src;
  XMMRegister wdst = xmm_0;
  XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0 : xmm_1;
  int vlen_enc = Assembler::AVX_128bit;
  if (vlen == 8) {
    vlen_enc = Assembler::AVX_256bit;
  }
  for (int i = log2(vlen) - 1; i >= 0; i--) {
    if (i == 0 && !is_dst_valid) {
      wdst = dst;
    }
    if (i == 1) {
      vextracti128_high(wtmp, wsrc);
    } else if (i == 2) {
      vextracti64x4_high(wtmp, wsrc);
    } else {
      assert(i == 0, "%d", i);
      vpermilpd(wtmp, wsrc, 1, vlen_enc);
    }
    vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc);
    wsrc = wdst;
    vlen_enc = Assembler::AVX_128bit;
  }
  if (is_dst_valid) {
    vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit);
  }
}

void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) {
  switch (bt) {
    case T_BYTE:  pextrb(dst, src, idx); break;
    case T_SHORT: pextrw(dst, src, idx); break;
    case T_INT:   pextrd(dst, src, idx); break;
    case T_LONG:  pextrq(dst, src, idx); break;

    default:
      assert(false, "Should not reach here.");
      break;
  }
}

XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16 / esize;
  int lane = elemindex / elem_per_lane;
  int eindex = elemindex % elem_per_lane;

  if (lane >= 2) {
    assert(UseAVX > 2, "required");
    vextractf32x4(dst, src, lane & 3);
    return dst;
  } else if (lane > 0) {
    assert(UseAVX > 0, "required");
    vextractf128(dst, src, lane);
    return dst;
  } else {
    return src;
  }
}

void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16 / esize;
  int eindex = elemindex % elem_per_lane;
  assert(is_integral_type(typ), "required");

  if (eindex == 0) {
    if (typ == T_LONG) {
      movq(dst, src);
    } else {
      movdl(dst, src);
      if (typ == T_BYTE)
        movsbl(dst, dst);
      else if (typ == T_SHORT)
        movswl(dst, dst);
    }
  } else {
    extract(typ, dst, src, eindex);
  }
}

void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) {
  int esize = type2aelembytes(typ);
  int elem_per_lane = 16 / esize;
  int eindex = elemindex % elem_per_lane;
  assert((typ == T_FLOAT || typ == T_DOUBLE), "required");

  if (eindex == 0) {
    movq(dst, src);
  } else {
    if (typ == T_FLOAT) {
      if (UseAVX == 0) {
        movdqu(dst, src);
        pshufps(dst, dst, eindex);
      } else {
        vpshufps(dst, src, src, eindex, Assembler::AVX_128bit);
      }
    } else {
      if (UseAVX == 0) {
        movdqu(dst, src);
        psrldq(dst, eindex*esize);
      } else {
        vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit);
      }
      movq(dst, dst);
    }
  }
  // Zero upper bits
  if (typ == T_FLOAT) {
    if (UseAVX == 0) {
      assert((vtmp != xnoreg) && (tmp != noreg), "required.");
      movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp);
      pand(dst, vtmp);
    } else {
      assert((tmp != noreg), "required.");
      vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp);
    }
  }
}

void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) {
  switch (typ) {
    case T_BYTE:
      evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_SHORT:
      evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_INT:
    case T_FLOAT:
      evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) {
  switch (typ) {
    case T_BYTE:
      evpblendmb(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_SHORT:
      evpblendmw(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_INT:
    case T_FLOAT:
      evpblendmd(dst, kmask, src1, src2, merge, vector_len);
      break;
    case T_LONG:
    case T_DOUBLE:
      evpblendmq(dst, kmask, src1, src2, merge, vector_len);
      break;
    default:
      assert(false, "Should not reach here.");
      break;
  }
}

//-------------------------------------------------------------------------------------------

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
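//
// As a rough scalar model (illustrative only, not VM code), the routine
// below computes the same answer as this sketch, where Elem is a byte for
// the LL encoding and a 16-bit char otherwise:
//
//   template <typename Elem>
//   static long index_of(const Elem* haystack, long hay_len,
//                        const Elem* needle, long needle_len) {
//     for (long i = 0; i + needle_len <= hay_len; i++) {
//       long j = 0;
//       while (j < needle_len && haystack[i + j] == needle[j]) j++;
//       if (j == needle_len) return i; // matched index in string
//     }
//     return -1;
//   }
//
// The vectorized version scans 16 bytes of the haystack per pcmpestri and
// only re-checks element by element around candidate positions.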
void C2_MacroAssembler::string_indexofC8(Register str1, Register str2,
                                         Register cnt1, Register cnt2,
                                         int int_cnt2, Register result,
                                         XMMRegister vec, Register tmp,
                                         int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode   = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR,
        RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR,
        MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE;

  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars");

  // Load substring.
  if (ae == StrIntrinsicNode::UL) {
    pmovzxbw(vec, Address(str2, 0));
  } else {
    movdqu(vec, Address(str2, 0));
  }
  movl(cnt2, int_cnt2);
  movptr(result, str1); // string addr

  if (int_cnt2 > stride) {
    jmpb(SCAN_TO_SUBSTR);

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    negptr(cnt2); // Jumped here with negative cnt2, convert to positive

    bind(RELOAD_STR);
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.

    // cnt2 is the number of remaining substring elements and
    // cnt1 is the number of remaining string elements when cmp failed.
    // Restored cnt1 = cnt1 - cnt2 + int_cnt2
    subl(cnt1, cnt2);
    addl(cnt1, int_cnt2);
    movl(cnt2, int_cnt2); // Now restore cnt2

    decrementl(cnt1);     // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // (int_cnt2 > 8)

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // Matched whole vector if first element matched (tmp(rcx) == 0).
  if (int_cnt2 == stride) {
    jccb(Assembler::overflow, RET_FOUND);    // OF == 1
  } else { // int_cnt2 > 8
    jccb(Assembler::overflow, FOUND_SUBSTR);
  }
  // After pcmpestri tmp(rcx) contains matched element index
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  if (int_cnt2 == stride) {
    jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  } else { // int_cnt2 > 8
    jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD);
  }
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(EXIT);

  if (int_cnt2 > stride) {
    // This code is optimized for the case when the whole substring
    // is matched if its head is matched.
    bind(MATCH_SUBSTR_HEAD);
    pcmpestri(vec, Address(result, 0), mode);
    // Reload only the string if it does not match
    jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0

    Label CONT_SCAN_SUBSTR;
    // Compare the rest of substring (> 8 chars).
    bind(FOUND_SUBSTR);
    // First 8 chars are already matched.
    negptr(cnt2);
    addptr(cnt2, stride);

    bind(SCAN_SUBSTR);
    subl(cnt1, stride);
    cmpl(cnt2, -stride); // Do not read beyond substring
    jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring:
    // cnt1 = cnt1 - cnt2 + 8
    addl(cnt1, cnt2); // cnt2 is negative
    addl(cnt1, stride);
    movl(cnt2, stride); negptr(cnt2);
    bind(CONT_SCAN_SUBSTR);
    if (int_cnt2 < (int)G) {
      int tail_off1 = int_cnt2<<scale1;
      int tail_off2 = int_cnt2<<scale2;
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2));
      } else {
        movdqu(vec, Address(str2, cnt2, scale2, tail_off2));
      }
      pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode);
    } else {
      // calculate index in register to avoid integer overflow (int_cnt2*2)
      movl(tmp, int_cnt2);
      addptr(tmp, cnt2);
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, tmp, scale2, 0));
      } else {
        movdqu(vec, Address(str2, tmp, scale2, 0));
      }
      pcmpestri(vec, Address(result, tmp, scale1, 0), mode);
    }
    // Need to reload string pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0
    addptr(cnt2, stride);
    jcc(Assembler::negative, SCAN_SUBSTR);
    // Fall through if found full substring

  } // (int_cnt2 > 8)

  bind(RET_FOUND);
  // Found result if we matched full small substring.
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(EXIT);

} // string_indexofC8

// Small strings are loaded through the stack if they cross a page boundary.
void C2_MacroAssembler::string_indexof(Register str1, Register str2,
                                       Register cnt1, Register cnt2,
                                       int int_cnt2, Register result,
                                       XMMRegister vec, Register tmp,
                                       int ae) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");
  assert(ae != StrIntrinsicNode::LU, "Invalid encoding");
  //
  // int_cnt2 is the length of a small (< 8 chars) constant substring
  // or (-1) for a non-constant substring, in which case its length
  // is in the cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; // UU, UL -> 8
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be -1 or in (0, stride)");

  // This method uses the pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //     0xc - mode: 1100 (substring search) + 00 (unsigned bytes)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts
  Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2;
  Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1;

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through the stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == (1>>scale2)) { // One byte
        assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding");
        load_unsigned_byte(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) {  // Three bytes
        // Not enough header space in 32-bit VM: 12+3 = 15.
        movl(result, Address(str2, -1));
        shrl(result, 8);
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 = {2, ..., 7})
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12, "sanity");
        if (ae == StrIntrinsicNode::UL) {
          int tail_off = int_cnt2-8;
          pmovzxbw(vec, Address(str2, tail_off));
          psrldq(vec, -2*tail_off);
        } else {
          int tail_off = int_cnt2*(1<<scale2);
          movdqu(vec, Address(str2, tail_off-16));
          psrldq(vec, 16-tail_off);
        }
      }
    } else { // not constant substring
      cmpl(cnt2, stride);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond a string if str+16 does not cross a page boundary
      // since heaps are aligned and mapped by pages.
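      // In scalar terms the page check below is (illustrative only):
      //
      //   bool safe_to_read16(uintptr_t addr, uintptr_t page_size) {
      //     // page_size is a power of two, so the AND yields the offset
      //     // of addr within its page; a 16-byte load stays on the page
      //     // iff that offset is at most page_size - 16.
      //     return (addr & (page_size - 1)) <= page_size - 16;
      //   }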
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only the low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to the stack to allow loading 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-(1<<scale2);
      push(cnt2);

      bind(COPY_SUBSTR);
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) {
        load_unsigned_byte(result, Address(str2, cnt2, scale2, -1));
        movb(Address(rsp, cnt2, scale2, stk_offset), result);
      } else if (ae == StrIntrinsicNode::UU) {
        load_unsigned_short(result, Address(str2, cnt2, scale2, -2));
        movw(Address(rsp, cnt2, scale2, stk_offset), result);
      }
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, stride);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only the low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -(1<<scale1);
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    if (ae == StrIntrinsicNode::LL) {
      load_unsigned_byte(result, Address(str1, cnt2, scale1, -1));
      movb(Address(rsp, cnt2, scale1, stk_offset), result);
    } else {
      load_unsigned_short(result, Address(str1, cnt2, scale1, -2));
      movw(Address(rsp, cnt2, scale1, stk_offset), result);
    }
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      if (ae == StrIntrinsicNode::UL) {
        pmovzxbw(vec, Address(str2, 0));
      } else {
        movdqu(vec, Address(str2, 0));
      }
      push(cnt2); // substr count
      push(str2); // substr addr
      push(str1); // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2 < 0) {  // Only for non-constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    // We came here after the beginning of the substring was
    // matched but the rest of it was not, so we need to search
    // again. Start from the next element after the previous match.
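    // At this point 'result' points at the failed candidate and 'str1'
    // still points at where the inner substring scan stopped, so the code
    // below restores the counters roughly as (illustrative only):
    //
    //   cnt1 += (str1 - result) / elem_size; // elements left at candidate
    //   cnt1 -= 1;                           // resume at the next element
    //   result += elem_size;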
    subptr(str1, result); // Restore counter
    if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
      shrl(str1, 1);
    }
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jcc(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, (1<<scale1));
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), mode);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, stride);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, stride); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, scale1, -16));
  movl(cnt1, stride);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmp(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, scale1));
  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, stride-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be -1 for a non-constant substring");
    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, stride);
    jcc(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, stride);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), mode);
    // Need to reload string pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, stride);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    if (ae == StrIntrinsicNode::UL) {
      addptr(str2, 8);
    } else {
      addptr(str2, 16);
    }
    subl(cnt1, stride);
    cmpl(cnt2, stride); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
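    // i.e., re-position so the last, possibly overlapping chunk ends
    // exactly at the end of the substring (illustrative only):
    //
    //   str2 += (cnt2 - stride) * elem2_size; // start of the final chunk
    //   str1 += (cnt2 - stride) * elem1_size;
    //   cnt1 -= (cnt2 - stride);
    //   cnt2  = stride;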
    if (ae == StrIntrinsicNode::UL) {
      lea(str2, Address(str2, cnt2, scale2, -8));
      lea(str1, Address(str1, cnt2, scale1, -16));
    } else {
      lea(str2, Address(str2, cnt2, scale2, -16));
      lea(str1, Address(str1, cnt2, scale1, -16));
    }
    subl(cnt1, cnt2);
    movl(cnt2, stride);
    addl(cnt1, stride);
    bind(CONT_SCAN_SUBSTR);
    if (ae == StrIntrinsicNode::UL) {
      pmovzxbw(vec, Address(str2, 0));
    } else {
      movdqu(vec, Address(str2, 0));
    }
    jmp(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) {
    shrl(result, 1); // index
  }
  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result,
                                            XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required");

  int stride = 8;

  Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP,
        SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP,
        RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT,
        FOUND_SEQ_CHAR, DONE_LABEL;

  movptr(result, str1);
  if (UseAVX >= 2) {
    cmpl(cnt1, stride);
    jcc(Assembler::less, SCAN_TO_CHAR);
    cmpl(cnt1, 2*stride);
    jcc(Assembler::less, SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    vpbroadcastw(vec1, vec1, Assembler::AVX_256bit);
    vpxor(vec2, vec2);
    movl(tmp, cnt1);
    andl(tmp, 0xFFFFFFF0);  // vector count (in chars)
    andl(cnt1, 0x0000000F); // tail count (in chars)

    bind(SCAN_TO_16_CHAR_LOOP);
    vmovdqu(vec3, Address(result, 0));
    vpcmpeqw(vec3, vec3, vec1, 1);
    vptest(vec2, vec3);
    jcc(Assembler::carryClear, FOUND_CHAR);
    addptr(result, 32);
    subl(tmp, 2*stride);
    jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP);
    jmp(SCAN_TO_8_CHAR);

    bind(SCAN_TO_8_CHAR_INIT);
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  bind(SCAN_TO_8_CHAR);
  cmpl(cnt1, stride);
  jcc(Assembler::less, SCAN_TO_CHAR);
  if (UseAVX < 2) {
    movdl(vec1, ch);
    pshuflw(vec1, vec1, 0x00);
    pshufd(vec1, vec1, 0);
    pxor(vec2, vec2);
  }
  movl(tmp, cnt1);
  andl(tmp, 0xFFFFFFF8);  // vector count (in chars)
  andl(cnt1, 0x00000007); // tail count (in chars)

  bind(SCAN_TO_8_CHAR_LOOP);
  movdqu(vec3, Address(result, 0));
  pcmpeqw(vec3, vec1);
  ptest(vec2, vec3);
  jcc(Assembler::carryClear, FOUND_CHAR);
  addptr(result, 16);
  subl(tmp, stride);
  jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP);

  bind(SCAN_TO_CHAR);
  testl(cnt1, cnt1);
  jcc(Assembler::zero, RET_NOT_FOUND);
  bind(SCAN_TO_CHAR_LOOP);
  load_unsigned_short(tmp, Address(result, 0));
  cmpl(ch, tmp);
  jccb(Assembler::equal, FOUND_SEQ_CHAR);
  addptr(result, 2);
  subl(cnt1, 1);
  jccb(Assembler::zero, RET_NOT_FOUND);
  jmp(SCAN_TO_CHAR_LOOP);

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(DONE_LABEL);

  bind(FOUND_CHAR);
  if (UseAVX >= 2) {
    vpmovmskb(tmp, vec3);
  } else {
    pmovmskb(tmp, vec3);
  }
  bsfl(ch, tmp);
  addl(result, ch);

  bind(FOUND_SEQ_CHAR);
  subptr(result, str1);
  shrl(result, 1);

  bind(DONE_LABEL);
} // string_indexof_char

// helper function for string_compare
void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2,
                                           Address::ScaleFactor scale, Address::ScaleFactor scale1,
                                           Address::ScaleFactor scale2, Register index, int ae) {
  if (ae == StrIntrinsicNode::LL) {
    load_unsigned_byte(elem1, Address(str1, index, scale, 0));
    load_unsigned_byte(elem2, Address(str2, index, scale, 0));
  } else if (ae == StrIntrinsicNode::UU) {
    load_unsigned_short(elem1, Address(str1, index, scale, 0));
    load_unsigned_short(elem2, Address(str2, index, scale, 0));
  } else {
    load_unsigned_byte(elem1, Address(str1, index, scale1, 0));
    load_unsigned_short(elem2, Address(str2, index, scale2, 0));
  }
}

// Compare strings, used for char[] and byte[].
void C2_MacroAssembler::string_compare(Register str1, Register str2,
                                       Register cnt1, Register cnt2, Register result,
                                       XMMRegister vec1, int ae) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;
  Label COMPARE_WIDE_VECTORS_LOOP_FAILED;  // used only _LP64 && AVX3
  int stride, stride2, adr_stride, adr_stride1, adr_stride2;
  int stride2x2 = 0x40;
  Address::ScaleFactor scale = Address::no_scale;
  Address::ScaleFactor scale1 = Address::no_scale;
  Address::ScaleFactor scale2 = Address::no_scale;

  if (ae != StrIntrinsicNode::LL) {
    stride2x2 = 0x20;
  }

  if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) {
    shrl(cnt2, 1);
  }
  // Compute the minimum of the string lengths and save the
  // difference of the string lengths on the stack.
  // Use a conditional move to avoid a branch.
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);
  cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);
  if (ae == StrIntrinsicNode::LL) {
    // Load first bytes
    load_unsigned_byte(result, Address(str1, 0));  // result = str1[0]
    load_unsigned_byte(cnt1, Address(str2, 0));    // cnt1   = str2[0]
  } else if (ae == StrIntrinsicNode::UU) {
    // Load first characters
    load_unsigned_short(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  } else {
    load_unsigned_byte(result, Address(str1, 0));
    load_unsigned_short(cnt1, Address(str2, 0));
  }
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);

  if (ae == StrIntrinsicNode::UU) {
    // Divide length by 2 to get number of chars
    shrl(cnt2, 1);
  }
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location and set up scale and stride
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    cmpptr(str1, str2);
    jcc(Assembler::equal, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL) {
      scale = Address::times_1;
      stride = 16;
    } else {
      scale = Address::times_2;
      stride = 8;
    }
  } else {
    scale1 = Address::times_1;
    scale2 = Address::times_2;
    // scale not used
    stride = 8;
  }

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX2;
    Label COMPARE_TAIL_LONG;
    Label COMPARE_WIDE_VECTORS_LOOP_AVX3;  // used only _LP64 && AVX3

    int pcmpmask = 0x19;
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }

    // Set up to compare 16-char (32-byte) vectors,
    // start from the first character again because it has an aligned address.
    if (ae == StrIntrinsicNode::LL) {
      stride2 = 32;
    } else {
      stride2 = 16;
    }
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      adr_stride = stride << scale;
    } else {
      adr_stride1 = 8;  // stride << scale1;
      adr_stride2 = 16; // stride << scale2;
    }

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);
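    // The control byte 0x19 decodes as: bits 1:0 = 01 (unsigned words;
    // cleared to 00 above for LL, i.e. unsigned bytes), bits 3:2 = 10
    // ("equal each": element-wise compare), bits 5:4 = 01 (negate the
    // result), so rcx receives the index of the first mismatch. A scalar
    // model (illustrative only; for LL the element type is a byte):
    //
    //   int first_mismatch(const unsigned short* a, const unsigned short* b, int n) {
    //     for (int i = 0; i < n; i++) {
    //       if (a[i] != b[i]) return i; // -> rcx, CF = 1
    //     }
    //     return n; // no mismatch: CF = 0
    //   }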
    // fast path: compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, adr_stride));
      pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, adr_stride1));
      pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Set up the registers to start the vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    subl(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // try the 64-byte fast loop
      cmpl(cnt2, stride2x2);
      jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2);
      testl(cnt2, stride2x2-1); // cnt2 holds the vector count
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2);   // means we cannot subtract by 0x40

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop
      if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
        evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      } else {
        vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit);
        evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0
      }
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare
      addptr(result, stride2x2); // update since we already compared at this addr
      subl(cnt2, stride2x2);     // and sub the size too
      jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      vpxor(vec1, vec1);
      jmpb(COMPARE_WIDE_TAIL);
    }//if (VM_Version::supports_avx512vlbw())
#endif // _LP64

    bind(COMPARE_WIDE_VECTORS_LOOP_AVX2);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      vmovdqu(vec1, Address(str1, result, scale));
      vpxor(vec1, Address(str2, result, scale));
    } else {
      vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit);
      vpxor(vec1, Address(str2, result, scale2));
    }
    vptest(vec1, vec1);
    jcc(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2);

    // Identifies the mismatching (higher or lower) 16 bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 and 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jcc(Assembler::less, COMPARE_SMALL_STR);

    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, 0));
    } else {
      pmovzxbw(vec1, Address(str1, 0));
    }
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Set up to compare 8-char (16-byte) vectors,
    // start from the first character again because it has an aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    if (ae == StrIntrinsicNode::LL) {
      pcmpmask &= ~0x01;
    }
    jcc(Assembler::zero, COMPARE_TAIL);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      lea(str1, Address(str1, result, scale));
      lea(str2, Address(str2, result, scale));
    } else {
      lea(str1, Address(str1, result, scale1));
      lea(str2, Address(str2, result, scale2));
    }
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1 - substring
    //     rax  - negative string length (elements count)
    //     mem  - scanned string
    //     rdx  - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //                + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jcc(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
      movdqu(vec1, Address(str1, result, scale));
      pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    } else {
      pmovzxbw(vec1, Address(str1, result, scale1));
      pcmpestri(vec1, Address(str2, result, scale2), pcmpmask);
    }
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae);
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) {
    lea(str1, Address(str1, cnt2, scale));
    lea(str2, Address(str2, cnt2, scale));
  } else {
    lea(str1, Address(str1, cnt2, scale1));
    lea(str2, Address(str2, cnt2, scale2));
  }
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae);
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);
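  // Taken together, the paths above implement the usual compareTo
  // contract; a scalar model (illustrative only, element types depend
  // on ae):
  //
  //   template <typename E1, typename E2>
  //   static int string_compare(const E1* s1, int len1, const E2* s2, int len2) {
  //     int min = len1 < len2 ? len1 : len2;
  //     for (int i = 0; i < min; i++) {
  //       if (s1[i] != s2[i]) return (int)s1[i] - (int)s2[i];
  //     }
  //     return len1 - len2;
  //   }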
2968 bind(LENGTH_DIFF_LABEL); 2969 pop(result); 2970 if (ae == StrIntrinsicNode::UU) { 2971 // Divide diff by 2 to get number of chars 2972 sarl(result, 1); 2973 } 2974 jmpb(DONE_LABEL); 2975 2976 #ifdef _LP64 2977 if (VM_Version::supports_avx512vlbw()) { 2978 2979 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 2980 2981 kmovql(cnt1, k7); 2982 notq(cnt1); 2983 bsfq(cnt2, cnt1); 2984 if (ae != StrIntrinsicNode::LL) { 2985 // Divide diff by 2 to get number of chars 2986 sarl(cnt2, 1); 2987 } 2988 addq(result, cnt2); 2989 if (ae == StrIntrinsicNode::LL) { 2990 load_unsigned_byte(cnt1, Address(str2, result)); 2991 load_unsigned_byte(result, Address(str1, result)); 2992 } else if (ae == StrIntrinsicNode::UU) { 2993 load_unsigned_short(cnt1, Address(str2, result, scale)); 2994 load_unsigned_short(result, Address(str1, result, scale)); 2995 } else { 2996 load_unsigned_short(cnt1, Address(str2, result, scale2)); 2997 load_unsigned_byte(result, Address(str1, result, scale1)); 2998 } 2999 subl(result, cnt1); 3000 jmpb(POP_LABEL); 3001 }//if (VM_Version::supports_avx512vlbw()) 3002 #endif // _LP64 3003 3004 // Discard the stored length difference 3005 bind(POP_LABEL); 3006 pop(cnt1); 3007 3008 // That's it 3009 bind(DONE_LABEL); 3010 if(ae == StrIntrinsicNode::UL) { 3011 negl(result); 3012 } 3013 3014 } 3015 3016 // Search for Non-ASCII character (Negative byte value) in a byte array, 3017 // return true if it has any and false otherwise. 3018 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3019 // @HotSpotIntrinsicCandidate 3020 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3021 // for (int i = off; i < off + len; i++) { 3022 // if (ba[i] < 0) { 3023 // return true; 3024 // } 3025 // } 3026 // return false; 3027 // } 3028 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3029 Register result, Register tmp1, 3030 XMMRegister vec1, XMMRegister vec2) { 3031 // rsi: byte array 3032 // rcx: len 3033 // rax: result 3034 ShortBranchVerifier sbv(this); 3035 assert_different_registers(ary1, len, result, tmp1); 3036 assert_different_registers(vec1, vec2); 3037 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3038 3039 // len == 0 3040 testl(len, len); 3041 jcc(Assembler::zero, FALSE_LABEL); 3042 3043 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3044 VM_Version::supports_avx512vlbw() && 3045 VM_Version::supports_bmi2()) { 3046 3047 Label test_64_loop, test_tail; 3048 Register tmp3_aliased = len; 3049 3050 movl(tmp1, len); 3051 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3052 3053 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3054 andl(len, ~(64 - 1)); // vector count (in chars) 3055 jccb(Assembler::zero, test_tail); 3056 3057 lea(ary1, Address(ary1, len, Address::times_1)); 3058 negptr(len); 3059 3060 bind(test_64_loop); 3061 // Check whether our 64 elements of size byte contain negatives 3062 evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3063 kortestql(k2, k2); 3064 jcc(Assembler::notZero, TRUE_LABEL); 3065 3066 addptr(len, 64); 3067 jccb(Assembler::notZero, test_64_loop); 3068 3069 3070 bind(test_tail); 3071 // bail out when there is nothing to be done 3072 testl(tmp1, -1); 3073 jcc(Assembler::zero, FALSE_LABEL); 3074 3075 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3076 #ifdef _LP64 3077 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3078 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3079 notq(tmp3_aliased); 3080 kmovql(k3, tmp3_aliased); 3081 #else 3082 Label 
#else
    Label k_init;
    jmp(k_init);

    // We cannot read 64 bits from a general purpose register on 32-bit
    // platforms, so we place the data required to compose 64 1's into the
    // instruction stream.
    // We emit a 64-byte-wide series of the elements 0..63, which is later
    // used as a compare target against the tail count held in the tmp1
    // register. The result is a k register with tmp1 consecutive 1's,
    // counting from the least significant bit.
    address tmp = pc();
    emit_int64(0x0706050403020100);
    emit_int64(0x0F0E0D0C0B0A0908);
    emit_int64(0x1716151413121110);
    emit_int64(0x1F1E1D1C1B1A1918);
    emit_int64(0x2726252423222120);
    emit_int64(0x2F2E2D2C2B2A2928);
    emit_int64(0x3736353433323130);
    emit_int64(0x3F3E3D3C3B3A3938);

    bind(k_init);
    lea(len, InternalAddress(tmp));
    // create mask to test for negative byte inside a vector
    evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit);
    evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit);

#endif
    evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit);
    ktestq(k2, k3);
    jcc(Assembler::notZero, TRUE_LABEL);

    jmp(FALSE_LABEL);
  } else {
    movl(result, len); // copy

    if (UseAVX >= 2 && UseSSE >= 2) {
      // With AVX2, use 32-byte vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 32-byte vectors
      andl(result, 0x0000001f); // tail count (in bytes)
      andl(len, 0xffffffe0);    // vector count (in bytes)
      jccb(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector
      movdl(vec2, tmp1);
      vpbroadcastd(vec2, vec2, Assembler::AVX_256bit);

      bind(COMPARE_WIDE_VECTORS);
      vmovdqu(vec1, Address(ary1, len, Address::times_1));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      addptr(len, 32);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jccb(Assembler::zero, FALSE_LABEL);

      vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
      vptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    } else if (UseSSE42Intrinsics) {
      // With SSE4.2, use double quad vector compare
      Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

      // Compare 16-byte vectors
      andl(result, 0x0000000f); // tail count (in bytes)
      andl(len, 0xfffffff0);    // vector count (in bytes)
      jcc(Assembler::zero, COMPARE_TAIL);

      lea(ary1, Address(ary1, len, Address::times_1));
      negptr(len);

      movl(tmp1, 0x80808080);
      movdl(vec2, tmp1);
      pshufd(vec2, vec2, 0);

      bind(COMPARE_WIDE_VECTORS);
      movdqu(vec1, Address(ary1, len, Address::times_1));
      ptest(vec1, vec2);
      jcc(Assembler::notZero, TRUE_LABEL);
      addptr(len, 16);
      jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

      testl(result, result);
      jcc(Assembler::zero, FALSE_LABEL);

      movdqu(vec1, Address(ary1, result, Address::times_1, -16));
      ptest(vec1, vec2);
      jccb(Assembler::notZero, TRUE_LABEL);
      jmpb(FALSE_LABEL);

      bind(COMPARE_TAIL); // len is zero
      movl(len, result);
      // Fallthru to tail compare
    }
  }
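  // Illustrative only (not generated code): the 4-byte scan below checks the
  // sign bits of four bytes at a time, roughly
  //
  //   for (ptrdiff_t i = -cnt4; i != 0; i += 4)
  //     if ((*(uint32_t*)(end + i) & 0x80808080) != 0) return true;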
  // Compare 4-byte vectors
  andl(len, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, len, Address::times_1));
  negptr(len);

  bind(COMPARE_VECTORS);
  movl(tmp1, Address(ary1, len, Address::times_1));
  andl(tmp1, 0x80808080);
  jccb(Assembler::notZero, TRUE_LABEL);
  addptr(len, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2); // tail char
  jccb(Assembler::zero, COMPARE_BYTE);
  load_unsigned_short(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00008080);
  jccb(Assembler::notZero, TRUE_LABEL);
  subptr(result, 2);
  lea(ary1, Address(ary1, 2));

  bind(COMPARE_BYTE);
  testl(result, 0x1); // tail byte
  jccb(Assembler::zero, FALSE_LABEL);
  load_unsigned_byte(tmp1, Address(ary1, 0));
  andl(tmp1, 0x00000080);
  jccb(Assembler::notEqual, TRUE_LABEL);
  jmpb(FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2 && UseSSE >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Compare char[] or byte[] arrays aligned to 4 bytes or substrings.
void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                      Register limit, Register result, Register chr,
                                      XMMRegister vec1, XMMRegister vec2, bool is_char) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE;

  int length_offset  = arrayOopDesc::length_offset_in_bytes();
  int base_offset    = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE);

  if (is_array_equ) {
    // Check the input args
    cmpoop(ary1, ary2);
    jcc(Assembler::equal, TRUE_LABEL);

    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }
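  // Illustrative only: from this point on the code is, in effect, a
  // vectorized
  //   result = (memcmp(ary1, ary2, byte_count) == 0);
  // using 64/32/16-byte vector compares plus a 4/2/1-byte tail.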
  if (is_array_equ && is_char) {
    // arrays_equals when used for char[].
    shll(limit, 1); // byte count != 0
  }
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001f); // tail count (in bytes)
    andl(limit, 0xffffffe0);  // vector count (in bytes)
    jcc(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

#ifdef _LP64
    if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop
      Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3;

      cmpl(limit, -64);
      jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2);

      bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop

      evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit);
      evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit);
      kortestql(k7, k7);
      jcc(Assembler::aboveEqual, FALSE_LABEL);  // miscompare
      addptr(limit, 64);  // update since we already compared at this addr
      cmpl(limit, -64);
      jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3);

      // At this point we may still need to compare -limit+result bytes.
      // We could execute the next two instructions and just continue via non-wide path:
      //  cmpl(limit, 0);
      //  jcc(Assembler::equal, COMPARE_TAIL); // true
      // But since we stopped at the points ary{1,2}+limit which are
      // not farther than 64 bytes from the ends of arrays ary{1,2}+result
      // (|limit| <= 32 and result < 32),
      // we may just compare the last 64 bytes.
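      // For example (illustrative): if the loop exits with limit == -32 and
      // result == 20, then 52 bytes remain unchecked; re-comparing the last
      // 64 bytes covers all of them and merely re-reads 12 bytes already
      // known to match.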
3314 // 3315 addptr(result, -64); // it is safe, bc we just came from this area 3316 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3317 evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3318 kortestql(k7, k7); 3319 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3320 3321 jmp(TRUE_LABEL); 3322 3323 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3324 3325 }//if (VM_Version::supports_avx512vlbw()) 3326 #endif //_LP64 3327 bind(COMPARE_WIDE_VECTORS); 3328 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3329 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3330 vpxor(vec1, vec2); 3331 3332 vptest(vec1, vec1); 3333 jcc(Assembler::notZero, FALSE_LABEL); 3334 addptr(limit, 32); 3335 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3336 3337 testl(result, result); 3338 jcc(Assembler::zero, TRUE_LABEL); 3339 3340 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3341 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3342 vpxor(vec1, vec2); 3343 3344 vptest(vec1, vec1); 3345 jccb(Assembler::notZero, FALSE_LABEL); 3346 jmpb(TRUE_LABEL); 3347 3348 bind(COMPARE_TAIL); // limit is zero 3349 movl(limit, result); 3350 // Fallthru to tail compare 3351 } else if (UseSSE42Intrinsics) { 3352 // With SSE4.2, use double quad vector compare 3353 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3354 3355 // Compare 16-byte vectors 3356 andl(result, 0x0000000f); // tail count (in bytes) 3357 andl(limit, 0xfffffff0); // vector count (in bytes) 3358 jcc(Assembler::zero, COMPARE_TAIL); 3359 3360 lea(ary1, Address(ary1, limit, Address::times_1)); 3361 lea(ary2, Address(ary2, limit, Address::times_1)); 3362 negptr(limit); 3363 3364 bind(COMPARE_WIDE_VECTORS); 3365 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3366 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3367 pxor(vec1, vec2); 3368 3369 ptest(vec1, vec1); 3370 jcc(Assembler::notZero, FALSE_LABEL); 3371 addptr(limit, 16); 3372 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3373 3374 testl(result, result); 3375 jcc(Assembler::zero, TRUE_LABEL); 3376 3377 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3378 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3379 pxor(vec1, vec2); 3380 3381 ptest(vec1, vec1); 3382 jccb(Assembler::notZero, FALSE_LABEL); 3383 jmpb(TRUE_LABEL); 3384 3385 bind(COMPARE_TAIL); // limit is zero 3386 movl(limit, result); 3387 // Fallthru to tail compare 3388 } 3389 3390 // Compare 4-byte vectors 3391 andl(limit, 0xfffffffc); // vector count (in bytes) 3392 jccb(Assembler::zero, COMPARE_CHAR); 3393 3394 lea(ary1, Address(ary1, limit, Address::times_1)); 3395 lea(ary2, Address(ary2, limit, Address::times_1)); 3396 negptr(limit); 3397 3398 bind(COMPARE_VECTORS); 3399 movl(chr, Address(ary1, limit, Address::times_1)); 3400 cmpl(chr, Address(ary2, limit, Address::times_1)); 3401 jccb(Assembler::notEqual, FALSE_LABEL); 3402 addptr(limit, 4); 3403 jcc(Assembler::notZero, COMPARE_VECTORS); 3404 3405 // Compare trailing char (final 2 bytes), if any 3406 bind(COMPARE_CHAR); 3407 testl(result, 0x2); // tail char 3408 jccb(Assembler::zero, COMPARE_BYTE); 3409 load_unsigned_short(chr, Address(ary1, 0)); 3410 load_unsigned_short(limit, Address(ary2, 0)); 3411 cmpl(chr, limit); 3412 jccb(Assembler::notEqual, FALSE_LABEL); 3413 3414 if (is_array_equ && is_char) { 3415 bind(COMPARE_BYTE); 3416 } else { 3417 lea(ary1, Address(ary1, 2)); 3418 lea(ary2, Address(ary2, 2)); 3419 3420 bind(COMPARE_BYTE); 3421 testl(result, 0x1); // tail 
  if (is_array_equ && is_char) {
    bind(COMPARE_BYTE);
  } else {
    lea(ary1, Address(ary1, 2));
    lea(ary2, Address(ary2, 2));

    bind(COMPARE_BYTE);
    testl(result, 0x1); // tail byte
    jccb(Assembler::zero, TRUE_LABEL);
    load_unsigned_byte(chr, Address(ary1, 0));
    load_unsigned_byte(limit, Address(ary2, 0));
    cmpl(chr, limit);
    jccb(Assembler::notEqual, FALSE_LABEL);
  }
  bind(TRUE_LABEL);
  movl(result, 1); // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}
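// Illustrative only: a hypothetical scalar reference (not HotSpot code) of
// what arrays_equals computes in the is_array_equ case, with elem_size
// standing in for the byte/char element width selected by is_char:
//
//   static bool arrays_equals_ref(char* a1, char* a2, int len1, int len2,
//                                 size_t elem_size, size_t base_offset) {
//     if (a1 == a2) return true;
//     if (a1 == NULL || a2 == NULL || len1 != len2) return false;
//     return memcmp(a1 + base_offset, a2 + base_offset,
//                   (size_t)len1 * elem_size) == 0;
//   }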