/*
 * Copyright (c) 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "oops/methodData.hpp"
#include "opto/c2_MacroAssembler.hpp"
#include "opto/intrinsicnode.hpp"
#include "opto/opcodes.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/stubRoutines.hpp"

inline Assembler::AvxVectorLen C2_MacroAssembler::vector_length_encoding(int vlen_in_bytes) {
  switch (vlen_in_bytes) {
    case  4: // fall-through
    case  8: // fall-through
    case 16: return Assembler::AVX_128bit;
    case 32: return Assembler::AVX_256bit;
    case 64: return Assembler::AVX_512bit;

    default: {
      ShouldNotReachHere();
      return Assembler::AVX_NoVec;
    }
  }
}

void C2_MacroAssembler::setvectmask(Register dst, Register src) {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::movl(dst, 1);
  Assembler::shlxl(dst, dst, src);
  Assembler::decl(dst);
  Assembler::kmovdl(k1, dst);
  Assembler::movl(dst, src);
}

void C2_MacroAssembler::restorevectmask() {
  guarantee(PostLoopMultiversioning, "must be");
  Assembler::knotwl(k1, k0);
}

#if INCLUDE_RTM_OPT

// Update rtm_counters based on abort status
// input: abort_status
//        rtm_counters (RTMLockingCounters*)
// flags are killed
void C2_MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) {

  atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset()));
  if (PrintPreciseRTMLockingStatistics) {
    for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) {
      Label check_abort;
      testl(abort_status, (1<<i));
      jccb(Assembler::equal, check_abort);
      atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx))));
      bind(check_abort);
    }
  }
}

// Branch if (random & (count-1) != 0), count is 2^n
// tmp, scr and flags are killed
void C2_MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) {
  assert(tmp == rax, "");
  assert(scr == rdx, "");
  rdtsc(); // modifies EDX:EAX
  andptr(tmp, count-1);
  jccb(Assembler::notZero, brLabel);
}

// Perform abort ratio calculation, set no_rtm bit if high ratio
// input: rtm_counters_Reg (RTMLockingCounters* address)
// tmpReg, rtm_counters_Reg and flags are killed
void C2_MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg,
                                                    Register rtm_counters_Reg,
                                                    RTMLockingCounters* rtm_counters,
                                                    Metadata* method_data) {
  Label L_done, L_check_always_rtm1, L_check_always_rtm2;

  if (RTMLockingCalculationDelay > 0) {
    // Delay calculation
    movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg);
    testptr(tmpReg, tmpReg);
    jccb(Assembler::equal, L_done);
  }
  // Abort ratio calculation only if abort_count > RTMAbortThreshold
  //   Aborted transactions = abort_count * 100
  //   All transactions = total_count * RTMTotalCountIncrRate
  //   Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio)

  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset()));
  cmpptr(tmpReg, RTMAbortThreshold);
  jccb(Assembler::below, L_check_always_rtm2);
  imulptr(tmpReg, tmpReg, 100);

  Register scrReg = rtm_counters_Reg;
  movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  imulptr(scrReg, scrReg, RTMTotalCountIncrRate);
  imulptr(scrReg, scrReg, RTMAbortRatio);
  cmpptr(tmpReg, scrReg);
  jccb(Assembler::below, L_check_always_rtm1);
  if (method_data != NULL) {
    // set rtm_state to "no rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM);
  }
  jmpb(L_done);
  bind(L_check_always_rtm1);
  // Reload RTMLockingCounters* address
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  bind(L_check_always_rtm2);
  movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset()));
  cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate);
  jccb(Assembler::below, L_done);
  if (method_data != NULL) {
    // set rtm_state to "always rtm" in MDO
    mov_metadata(tmpReg, method_data);
    lock();
    orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM);
  }
  bind(L_done);
}

// Update counters and perform abort ratio calculation
// input: abort_status_Reg
// rtm_counters_Reg, flags are killed
void C2_MacroAssembler::rtm_profiling(Register abort_status_Reg,
                                      Register rtm_counters_Reg,
                                      RTMLockingCounters* rtm_counters,
                                      Metadata* method_data,
                                      bool profile_rtm) {

  assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
  // update rtm counters based on rax value at abort
  // reads abort_status_Reg, updates flags
  lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters));
  rtm_counters_update(abort_status_Reg, rtm_counters_Reg);
  if (profile_rtm) {
    // Save abort status because abort_status_Reg is used by following code.
    if (RTMRetryCount > 0) {
      push(abort_status_Reg);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data);
    // restore abort status
    if (RTMRetryCount > 0) {
      pop(abort_status_Reg);
    }
  }
}
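
// For reference, a sketch of the RTM abort-status bits that the profiling
// and retry logic below keys on. The layout follows Intel's RTM
// specification for EAX after an abort (see rtmLocking.hpp for the
// authoritative definitions used by HotSpot); the names here are
// illustrative only:
//
//   abort_status_xabort   = 1 << 0   // abort raised by an explicit xabort
//   abort_status_retry    = 1 << 1   // transaction may succeed on retry
//   abort_status_conflict = 1 << 2   // memory conflict with another thread
//   abort_status_overflow = 1 << 3   // internal buffer overflow
//   abort_status_debug    = 1 << 4   // debug breakpoint hit
//   abort_status_nested   = 1 << 5   // abort inside a nested transaction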

// Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4)
// inputs: retry_count_Reg
//       : abort_status_Reg
// output: retry_count_Reg decremented by 1
// flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) {
  Label doneRetry;
  assert(abort_status_Reg == rax, "");
  // The abort reason bits are in eax (see all states in rtmLocking.hpp)
  // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4)
  // if reason is in 0x6 and retry count != 0 then retry
  andptr(abort_status_Reg, 0x6);
  jccb(Assembler::zero, doneRetry);
  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  pause();
  decrementl(retry_count_Reg);
  jmp(retryLabel);
  bind(doneRetry);
}

// Spin and retry if lock is busy,
// inputs: box_Reg (monitor address)
//       : retry_count_Reg
// output: retry_count_Reg decremented by 1
//       : clear z flag if retry count exceeded
// tmp_Reg, scr_Reg, flags are killed
void C2_MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg,
                                               Register tmp_Reg, Register scr_Reg, Label& retryLabel) {
  Label SpinLoop, SpinExit, doneRetry;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  testl(retry_count_Reg, retry_count_Reg);
  jccb(Assembler::zero, doneRetry);
  decrementl(retry_count_Reg);
  movptr(scr_Reg, RTMSpinLoopCount);

  bind(SpinLoop);
  pause();
  decrementl(scr_Reg);
  jccb(Assembler::lessEqual, SpinExit);
  movptr(tmp_Reg, Address(box_Reg, owner_offset));
  testptr(tmp_Reg, tmp_Reg);
  jccb(Assembler::notZero, SpinLoop);

  bind(SpinExit);
  jmp(retryLabel);
  bind(doneRetry);
  incrementl(retry_count_Reg); // clear z flag
}

// Use RTM for normal stack locks
// Input: objReg (object to lock)
void C2_MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg,
                                          Register retry_on_abort_count_Reg,
                                          RTMLockingCounters* stack_rtm_counters,
                                          Metadata* method_data, bool profile_rtm,
                                          Label& DONE_LABEL, Label& IsInflated) {
  assert(UseRTMForStackLocks, "why call this otherwise?");
  assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;

  if (RTMRetryCount > 0) {
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  testptr(tmpReg, markWord::monitor_value);  // inflated vs stack-locked|neutral|biased
  jcc(Assembler::notZero, IsInflated);

  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
  andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
  cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
  jcc(Assembler::equal, DONE_LABEL);                                // all done if unlocked

  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (UseRTMXendForLockBusy) {
    xend();
    movptr(abort_status_Reg, 0x2);    // Set the abort status to 2 (so we can retry)
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm);
  }
  bind(L_decrement_retry);
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }
}
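
// In outline, the transactional fast path emitted above corresponds to this
// compiler-intrinsic sketch (a rough model of the emitted code, not part of
// it; _xbegin/_xend/_xabort are the <immintrin.h> RTM intrinsics):
//
//   unsigned status = _xbegin();
//   if (status == _XBEGIN_STARTED) {
//     if (is_unlocked(obj->mark()))  // the read adds the mark word to the
//       return;                      //   read-set; commit happens at _xend()
//     _xabort(0);                    // already locked: abort the transaction
//   }
//   // 'status' now holds the abort reason: profile it, maybe retry,
//   // otherwise fall through to the conventional locking path.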

// Use RTM for inflating locks
// inputs: objReg (object to lock)
//         boxReg (on-stack box address (displaced header location) - KILLED)
//         tmpReg (ObjectMonitor address + markWord::monitor_value)
void C2_MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg,
                                             Register scrReg, Register retry_on_busy_count_Reg,
                                             Register retry_on_abort_count_Reg,
                                             RTMLockingCounters* rtm_counters,
                                             Metadata* method_data, bool profile_rtm,
                                             Label& DONE_LABEL) {
  assert(UseRTMLocking, "why call this otherwise?");
  assert(tmpReg == rax, "");
  assert(scrReg == rdx, "");
  Label L_rtm_retry, L_decrement_retry, L_on_abort;
  int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);

  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  movptr(boxReg, tmpReg); // Save ObjectMonitor address

  if (RTMRetryCount > 0) {
    movl(retry_on_busy_count_Reg, RTMRetryCount);  // Retry on lock busy
    movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort
    bind(L_rtm_retry);
  }
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    Label L_noincrement;
    if (RTMTotalCountIncrRate > 1) {
      // tmpReg, scrReg and flags are killed
      branch_on_random_using_rdtsc(tmpReg, scrReg, RTMTotalCountIncrRate, L_noincrement);
    }
    assert(rtm_counters != NULL, "should not be NULL when profiling RTM");
    atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg);
    bind(L_noincrement);
  }
  xbegin(L_on_abort);
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes()));
  movptr(tmpReg, Address(tmpReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jcc(Assembler::zero, DONE_LABEL);
  if (UseRTMXendForLockBusy) {
    xend();
    jmp(L_decrement_retry);
  }
  else {
    xabort(0);
  }
  bind(L_on_abort);
  Register abort_status_Reg = tmpReg; // status of abort is stored in RAX
  if (PrintPreciseRTMLockingStatistics || profile_rtm) {
    rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm);
  }
  if (RTMRetryCount > 0) {
    // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4)
    rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry);
  }

  movptr(tmpReg, Address(boxReg, owner_offset));
  testptr(tmpReg, tmpReg);
  jccb(Assembler::notZero, L_decrement_retry);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
#ifdef _LP64
  Register threadReg = r15_thread;
#else
  get_thread(scrReg);
  Register threadReg = scrReg;
#endif
  lock();
  cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg

  if (RTMRetryCount > 0) {
    // success done else retry
    jccb(Assembler::equal, DONE_LABEL);
    bind(L_decrement_retry);
    // Spin and retry if lock is busy.
    rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry);
  }
  else {
    bind(L_decrement_retry);
  }
}

#endif // INCLUDE_RTM_OPT

// fast_lock and fast_unlock used by C2

// Because the transitions from emitted code to the runtime
// monitorenter/exit helper stubs are so slow it's critical that
// we inline both the stack-locking fast path and the inflated fast path.
//
// See also: cmpFastLock and cmpFastUnlock.
//
// What follows is a specialized inline transliteration of the code
// in enter() and exit(). If we're concerned about I$ bloat another
// option would be to emit TrySlowEnter and TrySlowExit methods
// at startup-time. These methods would accept arguments as
// (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure
// indications in the icc.ZFlag. fast_lock and fast_unlock would simply
// marshal the arguments and emit calls to TrySlowEnter and TrySlowExit.
// In practice, however, the # of lock sites is bounded and is usually small.
// Besides the call overhead, TrySlowEnter and TrySlowExit might suffer
// if the processor uses simple bimodal branch predictors keyed by EIP,
// since the helper routines would be called from multiple synchronization
// sites.
//
// An even better approach would be to write "MonitorEnter()" and "MonitorExit()"
// in java - using j.u.c and unsafe - and just bind the lock and unlock sites
// to those specialized methods. That'd give us a mostly platform-independent
// implementation that the JITs could optimize and inline at their pleasure.
// Done correctly, the only time we'd need to cross to native code would be
// to park() or unpark() threads. We'd also need a few more unsafe operators
// to (a) prevent compiler-JIT reordering of non-volatile accesses, and
// (b) explicit barriers or fence operations.
//
// TODO:
//
// *  Arrange for C2 to pass "Self" into fast_lock and fast_unlock in one of the registers (scr).
//    This avoids manifesting the Self pointer in the fast_lock and fast_unlock terminals.
//    Given TLAB allocation, Self is usually manifested in a register, so passing it into
//    the lock operators would typically be faster than reifying Self.
//
// *  Ideally I'd define the primitives as:
//       fast_lock   (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED.
//       fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED
//    Unfortunately ADLC bugs prevent us from expressing the ideal form.
//    Instead, we're stuck with the rather awkward and brittle register assignments below.
//    Furthermore the register assignments are overconstrained, possibly resulting in
//    sub-optimal code near the synchronization site.
//
// *  Eliminate the sp-proximity tests and just use "== Self" tests instead.
//    Alternately, use a better sp-proximity test.
//
// *  Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value.
//    Either one is sufficient to uniquely identify a thread.
//    TODO: eliminate use of sp in _owner and use get_thread(tr) instead.
//
// *  Intrinsify notify() and notifyAll() for the common cases where the
//    object is locked by the calling thread but the waitlist is empty.
//    Avoid the expensive JNI calls to JVM_Notify() and JVM_NotifyAll().
//
// *  use jccb and jmpb instead of jcc and jmp to improve code density.
//    But beware of excessive branch density on AMD Opterons.
//
// *  Both fast_lock and fast_unlock set the ICC.ZF to indicate success
//    or failure of the fast path. If the fast path fails then we pass
//    control to the slow path, typically in C. In fast_lock and
//    fast_unlock we often branch to DONE_LABEL, just to find that C2
//    will emit a conditional branch immediately after the node.
//    So we have branches to branches and lots of ICC.ZF games.
//    Instead, it might be better to have C2 pass a "FailureLabel"
//    into fast_lock and fast_unlock. In the case of success, control
//    will drop through the node. ICC.ZF is undefined at exit.
//    In the case of failure, the node will branch directly to the
//    FailureLabel.
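
// To make the ZF protocol concrete: a sketch of how the code that C2 emits
// around the node consumes the flag (illustrative pseudo-assembly, not the
// actual cmpFastLock/cmpFastUnlock encodings):
//
//   fast_lock(obj, box, rax, scr)   // leaves ZF=1 on success, ZF=0 on failure
//   jne  L_slow_path                // ZF==0: call the runtime monitorenter stub
//   ...                             // ZF==1: lock acquired, fall through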

// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void C2_MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                                  Register scrReg, Register cx1Reg, Register cx2Reg,
                                  BiasedLockingCounters* counters,
                                  RTMLockingCounters* rtm_counters,
                                  RTMLockingCounters* stack_rtm_counters,
                                  Metadata* method_data,
                                  bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }

  // Possible cases that we'll encounter in fast_lock
  // ------------------------------------------------
  // * Inflated
  //    -- unlocked
  //    -- Locked
  //       = by self
  //       = by other
  // * biased
  //    -- by Self
  //    -- by other
  // * neutral
  // * stack-locked
  //    -- by self
  //       = sp-proximity test hits
  //       = sp-proximity test generates false-negative
  //    -- by other
  //

  Label IsInflated, DONE_LABEL;

  // it's stack-locked, biased or neutral
  // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
  // order to reduce the number of conditional branches in the most common cases.
  // Beware -- there's a subtle invariant that fetch of the markword
  // at [FETCH], below, will never observe a biased encoding (*101b).
  // If this invariant is not held we risk exclusion (safety) failure.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_enter(boxReg, objReg, tmpReg, scrReg, cx1Reg, false, DONE_LABEL, NULL, counters);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                      stack_rtm_counters, method_data, profile_rtm,
                      DONE_LABEL, IsInflated);
  }
#endif // INCLUDE_RTM_OPT

  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // [FETCH]
  testptr(tmpReg, markWord::monitor_value); // inflated vs stack-locked|neutral|biased
  jccb(Assembler::notZero, IsInflated);

  // Attempt stack-locking ...
  orptr (tmpReg, markWord::unlocked_value);
  movptr(Address(boxReg, 0), tmpReg);       // Anticipate successful CAS
  lock();
  cmpxchgptr(boxReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Updates tmpReg
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jcc(Assembler::equal, DONE_LABEL);        // Success

  // Recursive locking.
  // The object is stack-locked: markword contains stack pointer to BasicLock.
  // Locked by current thread if difference with current SP is less than one page.
  subptr(tmpReg, rsp);
  // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
  andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
  movptr(Address(boxReg, 0), tmpReg);
  if (counters != NULL) {
    cond_inc32(Assembler::equal,
               ExternalAddress((address)counters->fast_path_entry_count_addr()));
  }
  jmp(DONE_LABEL);
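
  // In C-like terms the recursive stack-lock test above is roughly the
  // following sketch (the real mask also demands the low mark bits be clear):
  //
  //   intptr_t dist = mark - rsp;
  //   ZF = ((dist & ~(intptr_t)(os::vm_page_size() - 1)) == 0); // within one page of SP
  //   box->displaced_header = dist & mask;  // 0 marks the recursive case for fast_unlock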

  bind(IsInflated);
  // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markWord::monitor_value

#if INCLUDE_RTM_OPT
  // Use the same RTM locking code in 32- and 64-bit VM.
  if (use_rtm) {
    rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                         rtm_counters, method_data, profile_rtm, DONE_LABEL);
  } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
  // The object is inflated.

  // boxReg refers to the on-stack BasicLock in the current frame.
  // We'd like to write:
  //   set box->_displaced_header = markWord::unused_mark(). Any non-0 value suffices.
  // This is convenient but results in an ST-before-CAS penalty. The following CAS suffers
  // additional latency as we have another ST in the store buffer that must drain.

  // avoid ST-before-CAS
  // register juggle because we need tmpReg for cmpxchgptr below
  movptr(scrReg, boxReg);
  movptr(boxReg, tmpReg); // consider: LEA box, [tmp-2]

  // Optimistic form: consider XORL tmpReg,tmpReg
  movptr(tmpReg, NULL_WORD);

  // Appears unlocked - try to swing _owner from null to non-null.
  // Ideally, I'd manifest "Self" with get_thread and then attempt
  // to CAS the register containing Self into m->Owner.
  // But we don't have enough registers, so instead we can either try to CAS
  // rsp or the address of the box (in scr) into &m->owner. If the CAS succeeds
  // we later store "Self" into m->Owner. Transiently storing a stack address
  // (rsp or the address of the box) into m->owner is harmless.
  // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand.
  lock();
  cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  movptr(Address(scrReg, 0), 3); // box->_displaced_header = 3
  // If we weren't able to swing _owner from NULL to the BasicLock
  // then take the slow path.
  jccb(Assembler::notZero, DONE_LABEL);
  // update _owner from BasicLock to thread
  get_thread(scrReg); // beware: clobbers ICCs
  movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
  xorptr(boxReg, boxReg); // set icc.ZFlag = 1 to indicate success

  // If the CAS fails we can either retry or pass control to the slow path.
  // We use the latter tactic.
  // Pass the CAS result in the icc.ZFlag into DONE_LABEL
  // If the CAS was successful ...
  //   Self has acquired the lock
  //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
  // Intentional fall-through into DONE_LABEL ...
#else // _LP64
  // It's inflated and we use scrReg for ObjectMonitor* in this section.
  movq(scrReg, tmpReg);
  xorq(tmpReg, tmpReg);
  lock();
  cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // Unconditionally set box->_displaced_header = markWord::unused_mark().
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(boxReg, 0), (int32_t)intptr_t(markWord::unused_mark().value()));
  // Intentional fall-through into DONE_LABEL ...
  // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
  } // use_rtm()
#endif
  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind(DONE_LABEL);

  // At DONE_LABEL the icc ZFlag is set as follows ...
  // fast_unlock uses the same protocol.
  // ZFlag == 1 -> Success
  // ZFlag == 0 -> Failure - force control through the slow path
}

// obj: object to unlock
// box: box address (displaced header location), killed. Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// fast_lock and fast_unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired by the current activation (frame). Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O. By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI. The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa. The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in fast_unlock().
// In the interest of performance we elide the m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.

void C2_MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) {
  assert(boxReg == rax, "");
  assert_different_registers(objReg, boxReg, tmpReg);

  Label DONE_LABEL, Stacked, CheckSucc;

  // Critically, the biased locking test must have precedence over
  // and appear before the (box->dhw == 0) recursive stack-lock test.
  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(objReg, tmpReg, DONE_LABEL);
  }

#if INCLUDE_RTM_OPT
  if (UseRTMForStackLocks && use_rtm) {
    assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking");
    Label L_regular_unlock;
    movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // fetch markword
    andptr(tmpReg, markWord::biased_lock_mask_in_place);              // look at 3 lock bits
    cmpptr(tmpReg, markWord::unlocked_value);                         // bits = 001 unlocked
    jccb(Assembler::notEqual, L_regular_unlock);                      // if !HLE RegularLock
    xend();                                                           // otherwise end...
    jmp(DONE_LABEL);                                                  // ... and we're done
    bind(L_regular_unlock);
  }
#endif

  cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD);                   // Examine the displaced header
  jcc   (Assembler::zero, DONE_LABEL);                              // 0 indicates recursive stack-lock
  movptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Examine the object's markword
  testptr(tmpReg, markWord::monitor_value);                         // Inflated?
  jccb  (Assembler::zero, Stacked);

  // It's inflated.
#if INCLUDE_RTM_OPT
  if (use_rtm) {
    Label L_regular_inflated_unlock;
    int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner);
    movptr(boxReg, Address(tmpReg, owner_offset));
    testptr(boxReg, boxReg);
    jccb(Assembler::notZero, L_regular_inflated_unlock);
    xend();
    jmpb(DONE_LABEL);
    bind(L_regular_inflated_unlock);
  }
#endif

  // Despite our balanced locking property we still check that m->_owner == Self
  // as java routines or native JNI code called by this thread might
  // have released the lock.
  // Refer to the comments in synchronizer.cpp for how we might encode extra
  // state in _succ so we can avoid fetching EntryList|cxq.
  //
  // I'd like to add more cases in fast_lock() and fast_unlock() --
  // such as recursive enter and exit -- but we have to be wary of
  // I$ bloat, T$ effects and BP$ effects.
  //
  // If there's no contention try a 1-0 exit. That is, exit without
  // a costly MEMBAR or CAS. See synchronizer.cpp for details on how
  // we detect and recover from the race that the 1-0 exit admits.
  //
  // Conceptually fast_unlock() must execute a STST|LDST "release" barrier
  // before it STs null into _owner, releasing the lock. Updates
  // to data protected by the critical section must be visible before
  // we drop the lock (and thus before any other thread could acquire
  // the lock and observe the fields protected by the lock).
  // IA32's memory-model is SPO, so STs are ordered with respect to
  // each other and there's no need for an explicit barrier (fence).
  // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html.
#ifndef _LP64
  get_thread (boxReg);

  // Note that we could employ various encoding schemes to reduce
  // the number of loads below (currently 4) to just 2 or 3.
  // Refer to the comments in synchronizer.cpp.
  // In practice the chain of fetches doesn't seem to impact performance, however.
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  jccb  (Assembler::notZero, CheckSucc);
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD);
  jmpb  (DONE_LABEL);

  bind (Stacked);
  // It's not inflated and it's not recursively stack-locked and it's not biased.
  // It must be stack-locked.
  // Try to reset the header to displaced header.
  // The "box" value on the stack is stable, so we can reload
  // and be assured we observe the same value as above.
  movptr(tmpReg, Address(boxReg, 0));
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box
  // Intentional fall-through into DONE_LABEL

  // DONE_LABEL is a hot target - we'd really like to place it at the
  // start of a cache line by padding with NOPs.
  // See the AMD and Intel software optimization manuals for the
  // most efficient "long" NOP encodings.
  // Unfortunately none of our alignment mechanisms suffice.
  bind (CheckSucc);
#else // _LP64
  // It's inflated
  xorptr(boxReg, boxReg);
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions)));
  jccb  (Assembler::notZero, DONE_LABEL);
  movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq)));
  orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList)));
  jccb  (Assembler::notZero, CheckSucc);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);
  jmpb  (DONE_LABEL);

  // Try to avoid passing control into the slow_path ...
  Label LSuccess, LGoSlowPath;
  bind  (CheckSucc);

  // The following optional optimization can be elided if necessary
  // Effectively: if (succ == null) goto slow path
  // The code reduces the window for a race, however,
  // and thus benefits performance.
  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::zero, LGoSlowPath);

  xorptr(boxReg, boxReg);
  // Without cast to int32_t this style of movptr will destroy r10 which is typically obj.
  movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD);

  // Memory barrier/fence
  // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ
  // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack.
  // This is faster on Nehalem and AMD Shanghai/Barcelona.
  // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences
  // We might also restructure (ST Owner=0;barrier;LD _Succ) to
  // (mov box,0; xchgq box, &m->Owner; LD _succ) .
  lock(); addl(Address(rsp, 0), 0);

  cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD);
  jccb  (Assembler::notZero, LSuccess);

  // Rare inopportune interleaving - race.
  // The successor vanished in the small window above.
  // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor.
  // We need to ensure progress and succession.
  // Try to reacquire the lock.
  // If that fails then the new owner is responsible for succession and this
  // thread needs to take no further action and can exit via the fast path (success).
  // If the re-acquire succeeds then pass control into the slow path.
  // As implemented, this latter mode is horrible because we generated more
  // coherence traffic on the lock *and* artificially extended the critical section
  // length by virtue of passing control into the slow path.

  // box is really RAX -- the following CMPXCHG depends on that binding
  // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R)
  lock();
  cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
  // There's no successor so we tried to regrab the lock.
  // If that didn't work, then another thread grabbed the
  // lock so we're done (and exit was a success).
  jccb  (Assembler::notEqual, LSuccess);
  // Intentional fall-through into slow path

  bind  (LGoSlowPath);
  orl   (boxReg, 1);   // set ICC.ZF=0 to indicate failure
  jmpb  (DONE_LABEL);

  bind  (LSuccess);
  testl (boxReg, 0);   // set ICC.ZF=1 to indicate success
  jmpb  (DONE_LABEL);

  bind  (Stacked);
  movptr(tmpReg, Address (boxReg, 0)); // re-fetch
  lock();
  cmpxchgptr(tmpReg, Address(objReg, oopDesc::mark_offset_in_bytes())); // Uses RAX which is box

#endif
  bind(DONE_LABEL);
}

//-------------------------------------------------------------------------------------------
// Generic instructions support for use in .ad files C2 code generation

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVD) {
    andpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    xorpd(dst, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegd(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVD) {
    vandpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVD),"opcode should be Op_NegVD");
    vxorpd(dst, src, ExternalAddress(StubRoutines::x86::vector_double_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, Register scr) {
  if (dst != src) {
    movdqu(dst, src);
  }
  if (opcode == Op_AbsVF) {
    andps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    xorps(dst, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), scr);
  }
}

void C2_MacroAssembler::vabsnegf(int opcode, XMMRegister dst, XMMRegister src, int vector_len, Register scr) {
  if (opcode == Op_AbsVF) {
    vandps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_mask()), vector_len, scr);
  } else {
    assert((opcode == Op_NegVF),"opcode should be Op_NegVF");
    vxorps(dst, src, ExternalAddress(StubRoutines::x86::vector_float_sign_flip()), vector_len, scr);
  }
}

void C2_MacroAssembler::pminmax(int opcode, BasicType elem_bt, XMMRegister dst, XMMRegister src, XMMRegister tmp) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      pminsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pminsw(dst, src);
    } else if (elem_bt == T_INT) {
      pminsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, dst);
      pcmpgtq(xmm0, src);
      blendvpd(dst, src); // xmm0 as mask
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      pmaxsb(dst, src);
    } else if (elem_bt == T_SHORT) {
      pmaxsw(dst, src);
    } else if (elem_bt == T_INT) {
      pmaxsd(dst, src);
    } else {
      assert(elem_bt == T_LONG, "required");
      assert(tmp == xmm0, "required");
      movdqu(xmm0, src);
      pcmpgtq(xmm0, dst);
      blendvpd(dst, src); // xmm0 as mask
    }
  }
}

void C2_MacroAssembler::vpminmax(int opcode, BasicType elem_bt,
                                 XMMRegister dst, XMMRegister src1, XMMRegister src2,
                                 int vlen_enc) {
  assert(opcode == Op_MinV || opcode == Op_MaxV, "sanity");

  if (opcode == Op_MinV) {
    if (elem_bt == T_BYTE) {
      vpminsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpminsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpminsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpminsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src1, src2, dst, vlen_enc);
      }
    }
  } else { // opcode == Op_MaxV
    if (elem_bt == T_BYTE) {
      vpmaxsb(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_SHORT) {
      vpmaxsw(dst, src1, src2, vlen_enc);
    } else if (elem_bt == T_INT) {
      vpmaxsd(dst, src1, src2, vlen_enc);
    } else {
      assert(elem_bt == T_LONG, "required");
      if (UseAVX > 2 && (vlen_enc == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
        vpmaxsq(dst, src1, src2, vlen_enc);
      } else {
        vpcmpgtq(dst, src1, src2, vlen_enc);
        vblendvpd(dst, src2, src1, dst, vlen_enc);
      }
    }
  }
}
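
// Where no packed 64-bit min/max instruction is available (pre-AVX-512, or
// no AVX512VL for sub-512-bit vectors), the compare+blend pairs above
// compute, per 64-bit lane, roughly:
//
//   mask = (src1 > src2) ? ~0L : 0L;      // vpcmpgtq
//   min  = mask_negative ? src2 : src1;   // vblendvpd keys on each mask
//   max  = mask_negative ? src1 : src2;   //   lane's sign bit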

// Float/Double min max

void C2_MacroAssembler::vminmax_fp(int opcode, BasicType elem_bt,
                                   XMMRegister dst, XMMRegister a, XMMRegister b,
                                   XMMRegister tmp, XMMRegister atmp, XMMRegister btmp,
                                   int vlen_enc) {
  assert(UseAVX > 0, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);

  if (!is_double_word && is_min) {
    vblendvps(atmp, a, b, a, vlen_enc);
    vblendvps(btmp, b, a, a, vlen_enc);
    vminps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (!is_double_word && !is_min) {
    vblendvps(btmp, b, a, b, vlen_enc);
    vblendvps(atmp, a, b, b, vlen_enc);
    vmaxps(tmp, atmp, btmp, vlen_enc);
    vcmpps(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvps(dst, tmp, atmp, btmp, vlen_enc);
  } else if (is_double_word && is_min) {
    vblendvpd(atmp, a, b, a, vlen_enc);
    vblendvpd(btmp, b, a, a, vlen_enc);
    vminpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    vblendvpd(btmp, b, a, b, vlen_enc);
    vblendvpd(atmp, a, b, b, vlen_enc);
    vmaxpd(tmp, atmp, btmp, vlen_enc);
    vcmppd(btmp, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    vblendvpd(dst, tmp, atmp, btmp, vlen_enc);
  }
}

void C2_MacroAssembler::evminmax_fp(int opcode, BasicType elem_bt,
                                    XMMRegister dst, XMMRegister a, XMMRegister b,
                                    KRegister ktmp, XMMRegister atmp, XMMRegister btmp,
                                    int vlen_enc) {
  assert(UseAVX > 2, "required");
  assert(opcode == Op_MinV || opcode == Op_MinReductionV ||
         opcode == Op_MaxV || opcode == Op_MaxReductionV, "sanity");
  assert(elem_bt == T_FLOAT || elem_bt == T_DOUBLE, "sanity");

  bool is_min = (opcode == Op_MinV || opcode == Op_MinReductionV);
  bool is_double_word = is_double_word_type(elem_bt);
  bool merge = true;

  if (!is_double_word && is_min) {
    evpmovd2m(ktmp, a, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vminps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (!is_double_word && !is_min) {
    evpmovd2m(ktmp, b, vlen_enc);
    evblendmps(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmps(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxps(dst, atmp, btmp, vlen_enc);
    evcmpps(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdqul(dst, ktmp, atmp, merge, vlen_enc);
  } else if (is_double_word && is_min) {
    evpmovq2m(ktmp, a, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vminpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  } else {
    assert(is_double_word && !is_min, "sanity");
    evpmovq2m(ktmp, b, vlen_enc);
    evblendmpd(atmp, ktmp, a, b, merge, vlen_enc);
    evblendmpd(btmp, ktmp, b, a, merge, vlen_enc);
    vmaxpd(dst, atmp, btmp, vlen_enc);
    evcmppd(ktmp, k0, atmp, atmp, Assembler::UNORD_Q, vlen_enc);
    evmovdquq(dst, ktmp, atmp, merge, vlen_enc);
  }
}
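
// Both routines above implement Java Math.min/Math.max semantics rather than
// raw minps/maxps semantics: NaN in either input produces NaN, and -0.0
// orders below +0.0. Per lane, the behavior matches this scalar sketch:
//
//   float jmin(float a, float b) {
//     if (isNaN(a) || isNaN(b)) return NaN;            // UNORD compare + blend
//     if (a == 0.0f && b == 0.0f)                      // sign-based pre-blends
//       return (signbit(a) || signbit(b)) ? -0.0f : +0.0f;
//     return (a < b) ? a : b;                          // ordinary case: vminps
//   }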

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src) {
  if (sign) {
    pmovsxbw(dst, src);
  } else {
    pmovzxbw(dst, src);
  }
}

void C2_MacroAssembler::vextendbw(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbw(dst, src, vector_len);
  } else {
    vpmovzxbw(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendbd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxbd(dst, src, vector_len);
  } else {
    vpmovzxbd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vextendwd(bool sign, XMMRegister dst, XMMRegister src, int vector_len) {
  if (sign) {
    vpmovsxwd(dst, src, vector_len);
  } else {
    vpmovzxwd(dst, src, vector_len);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVI:  psrad(dst, shift); break;
    case Op_LShiftVI:  pslld(dst, shift); break;
    case Op_URShiftVI: psrld(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVI:  vpsrad(dst, src, shift, vlen_enc); break;
    case Op_LShiftVI:  vpslld(dst, src, shift, vlen_enc); break;
    case Op_URShiftVI: vpsrld(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  psraw(dst, shift); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  psllw(dst, shift); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: psrlw(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  vpsraw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  vpsllw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVS: // fall-through
    case Op_URShiftVB: vpsrlw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister shift) {
  switch (opcode) {
    case Op_RShiftVL:  psrlq(dst, shift); break; // using srl to implement sra on pre-avx512 systems
    case Op_LShiftVL:  psllq(dst, shift); break;
    case Op_URShiftVL: psrlq(dst, shift); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::vshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVL:  evpsraq(dst, src, shift, vlen_enc); break;
    case Op_LShiftVL:  vpsllq(dst, src, shift, vlen_enc); break;
    case Op_URShiftVL: vpsrlq(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftd(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  // fall-through
    case Op_RShiftVI:  vpsravd(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  // fall-through
    case Op_LShiftVI:  vpsllvd(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: // fall-through
    case Op_URShiftVI: vpsrlvd(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc) {
  switch (opcode) {
    case Op_RShiftVB:  // fall-through
    case Op_RShiftVS:  evpsravw(dst, src, shift, vlen_enc); break;

    case Op_LShiftVB:  // fall-through
    case Op_LShiftVS:  evpsllvw(dst, src, shift, vlen_enc); break;

    case Op_URShiftVB: // fall-through
    case Op_URShiftVS: evpsrlvw(dst, src, shift, vlen_enc); break;

    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

void C2_MacroAssembler::varshiftq(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vlen_enc, XMMRegister tmp) {
  assert(UseAVX >= 2, "required");
  switch (opcode) {
    case Op_RShiftVL: {
      if (UseAVX > 2) {
        assert(tmp == xnoreg, "not used");
        if (!VM_Version::supports_avx512vl()) {
          vlen_enc = Assembler::AVX_512bit;
        }
        evpsravq(dst, src, shift, vlen_enc);
      } else {
        vmovdqu(tmp, ExternalAddress(StubRoutines::x86::vector_long_sign_mask()));
        vpsrlvq(dst, src, shift, vlen_enc);
        vpsrlvq(tmp, tmp, shift, vlen_enc);
        vpxor(dst, dst, tmp, vlen_enc);
        vpsubq(dst, dst, tmp, vlen_enc);
      }
      break;
    }
    case Op_LShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsllvq(dst, src, shift, vlen_enc);
      break;
    }
    case Op_URShiftVL: {
      assert(tmp == xnoreg, "not used");
      vpsrlvq(dst, src, shift, vlen_enc);
      break;
    }
    default: assert(false, "%s", NodeClassNames[opcode]);
  }
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving word result in dst
void C2_MacroAssembler::varshiftbw(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  assert(vector_len == 0, "required");
  vextendbd(sign, dst, src, 1);
  vpmovzxbd(vtmp, shift, 1);
  varshiftd(opcode, dst, dst, vtmp, 1);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_int_to_byte_mask()), 1, scratch);
  vextracti128_high(vtmp, dst);
  vpackusdw(dst, dst, vtmp, 0);
}

// Variable shift src by shift using vtmp and scratch as TEMPs giving byte result in dst
void C2_MacroAssembler::evarshiftb(int opcode, XMMRegister dst, XMMRegister src, XMMRegister shift, int vector_len, XMMRegister vtmp, Register scratch) {
  assert(opcode == Op_LShiftVB ||
         opcode == Op_RShiftVB ||
         opcode == Op_URShiftVB, "%s", NodeClassNames[opcode]);
  bool sign = (opcode != Op_URShiftVB);
  int ext_vector_len = vector_len + 1;
  vextendbw(sign, dst, src, ext_vector_len);
  vpmovzxbw(vtmp, shift, ext_vector_len);
  varshiftw(opcode, dst, dst, vtmp, ext_vector_len);
  vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_short_to_byte_mask()), ext_vector_len, scratch);
  if (vector_len == 0) {
    vextracti128_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
  } else {
    vextracti64x4_high(vtmp, dst);
    vpackuswb(dst, dst, vtmp, vector_len);
    vpermq(dst, dst, 0xD8, vector_len);
  }
}
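
// Both byte-shift helpers above use the same widen/shift/narrow idiom, since
// x86 has no per-lane variable byte shift. Per byte lane, roughly:
//
//   wide   = sign ? sign_extend(src[i]) : zero_extend(src[i]); // to word/dword
//   wide   = shift_op(wide, shift[i]);                         // variable shift
//   dst[i] = wide & 0xFF;                                      // mask, then pack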

void C2_MacroAssembler::insert(BasicType typ, XMMRegister dst, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      pinsrb(dst, val, idx);
      break;
    case T_SHORT:
      pinsrw(dst, val, idx);
      break;
    case T_INT:
      pinsrd(dst, val, idx);
      break;
    case T_LONG:
      pinsrq(dst, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vinsert(BasicType typ, XMMRegister dst, XMMRegister src, Register val, int idx) {
  switch(typ) {
    case T_BYTE:
      vpinsrb(dst, src, val, idx);
      break;
    case T_SHORT:
      vpinsrw(dst, src, val, idx);
      break;
    case T_INT:
      vpinsrd(dst, src, val, idx);
      break;
    case T_LONG:
      vpinsrq(dst, src, val, idx);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::vgather(BasicType typ, XMMRegister dst, Register base, XMMRegister idx, XMMRegister mask, int vector_len) {
  switch(typ) {
    case T_INT:
      vpgatherdd(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_FLOAT:
      vgatherdps(dst, Address(base, idx, Address::times_4), mask, vector_len);
      break;
    case T_LONG:
      vpgatherdq(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    case T_DOUBLE:
      vgatherdpd(dst, Address(base, idx, Address::times_8), mask, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evgather(BasicType typ, XMMRegister dst, KRegister mask, Register base, XMMRegister idx, int vector_len) {
  switch(typ) {
    case T_INT:
      evpgatherdd(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_FLOAT:
      evgatherdps(dst, mask, Address(base, idx, Address::times_4), vector_len);
      break;
    case T_LONG:
      evpgatherdq(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    case T_DOUBLE:
      evgatherdpd(dst, mask, Address(base, idx, Address::times_8), vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::evscatter(BasicType typ, Register base, XMMRegister idx, KRegister mask, XMMRegister src, int vector_len) {
  switch(typ) {
    case T_INT:
      evpscatterdd(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_FLOAT:
      evscatterdps(Address(base, idx, Address::times_4), mask, src, vector_len);
      break;
    case T_LONG:
      evpscatterdq(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    case T_DOUBLE:
      evscatterdpd(Address(base, idx, Address::times_8), mask, src, vector_len);
      break;
    default:
      assert(false,"Should not reach here.");
      break;
  }
}

void C2_MacroAssembler::load_vector_mask(XMMRegister dst, XMMRegister src, int vlen_in_bytes, BasicType elem_bt) {
  if (vlen_in_bytes <= 16) {
    pxor (dst, dst);
    psubb(dst, src);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  pmovsxbw(dst, dst); break;
      case T_INT:    pmovsxbd(dst, dst); break;
      case T_FLOAT:  pmovsxbd(dst, dst); break;
      case T_LONG:   pmovsxbq(dst, dst); break;
      case T_DOUBLE: pmovsxbq(dst, dst); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  } else {
    int vlen_enc = vector_length_encoding(vlen_in_bytes);

    vpxor (dst, dst, dst, vlen_enc);
    vpsubb(dst, dst, src, vlen_enc);
    switch (elem_bt) {
      case T_BYTE:   /* nothing to do */ break;
      case T_SHORT:  vpmovsxbw(dst, dst, vlen_enc); break;
      case T_INT:    vpmovsxbd(dst, dst, vlen_enc); break;
      case T_FLOAT:  vpmovsxbd(dst, dst, vlen_enc); break;
      case T_LONG:   vpmovsxbq(dst, dst, vlen_enc); break;
      case T_DOUBLE: vpmovsxbq(dst, dst, vlen_enc); break;

      default: assert(false, "%s", type2name(elem_bt));
    }
  }
}
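
// The negate-then-sign-extend pattern above turns a 0/1 boolean byte vector
// into all-ones/all-zeros lane masks. Per lane, roughly:
//
//   b   = 0 - src[i];                  // 1 -> 0xFF (-1), 0 -> 0x00
//   dst = sign_extend(b, lane_width);  // -1 widens to an all-ones lane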

void C2_MacroAssembler::load_iota_indices(XMMRegister dst, Register scratch, int vlen_in_bytes) {
  ExternalAddress addr(StubRoutines::x86::vector_iota_indices());
  if (vlen_in_bytes <= 16) {
    movdqu(dst, addr, scratch);
  } else if (vlen_in_bytes == 32) {
    vmovdqu(dst, addr, scratch);
  } else {
    assert(vlen_in_bytes == 64, "%d", vlen_in_bytes);
    evmovdqub(dst, k0, addr, false /*merge*/, Assembler::AVX_512bit, scratch);
  }
}

// Reductions for vectors of bytes, shorts, ints, longs, floats, and doubles.

void C2_MacroAssembler::reduce_operation_128(BasicType typ, int opcode, XMMRegister dst, XMMRegister src) {
  int vector_len = Assembler::AVX_128bit;

  switch (opcode) {
    case Op_AndReductionV:  pand(dst, src); break;
    case Op_OrReductionV:   por (dst, src); break;
    case Op_XorReductionV:  pxor(dst, src); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  pminsb(dst, src); break;
        case T_SHORT: pminsw(dst, src); break;
        case T_INT:   pminsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  pmaxsb(dst, src); break;
        case T_SHORT: pmaxsw(dst, src); break;
        case T_INT:   pmaxsd(dst, src); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, dst, src, Assembler::AVX_128bit); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVF: addss(dst, src); break;
    case Op_AddReductionVD: addsd(dst, src); break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  paddb(dst, src); break;
        case T_SHORT: paddw(dst, src); break;
        case T_INT:   paddd(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: paddq(dst, src); break;
    case Op_MulReductionVF: mulss(dst, src); break;
    case Op_MulReductionVD: mulsd(dst, src); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: pmullw(dst, src); break;
        case T_INT:   pmulld(dst, src); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: assert(UseAVX > 2, "required");
                            vpmullq(dst, dst, src, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_operation_256(BasicType typ, int opcode, XMMRegister dst, XMMRegister src1, XMMRegister src2) {
  int vector_len = Assembler::AVX_256bit;

  switch (opcode) {
    case Op_AndReductionV:  vpand(dst, src1, src2, vector_len); break;
    case Op_OrReductionV:   vpor (dst, src1, src2, vector_len); break;
    case Op_XorReductionV:  vpxor(dst, src1, src2, vector_len); break;
    case Op_MinReductionV:
      switch (typ) {
        case T_BYTE:  vpminsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpminsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpminsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpminsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MaxReductionV:
      switch (typ) {
        case T_BYTE:  vpmaxsb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpmaxsw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmaxsd(dst, src1, src2, vector_len); break;
        case T_LONG:  assert(UseAVX > 2, "required");
                      vpmaxsq(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVI:
      switch (typ) {
        case T_BYTE:  vpaddb(dst, src1, src2, vector_len); break;
        case T_SHORT: vpaddw(dst, src1, src2, vector_len); break;
        case T_INT:   vpaddd(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_AddReductionVL: vpaddq(dst, src1, src2, vector_len); break;
    case Op_MulReductionVI:
      switch (typ) {
        case T_SHORT: vpmullw(dst, src1, src2, vector_len); break;
        case T_INT:   vpmulld(dst, src1, src2, vector_len); break;
        default:      assert(false, "wrong type");
      }
      break;
    case Op_MulReductionVL: vpmullq(dst, src1, src2, vector_len); break;
    default:                assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduce_fp(int opcode, int vlen,
                                  XMMRegister dst, XMMRegister src,
                                  XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (opcode) {
    case Op_AddReductionVF:
    case Op_MulReductionVF:
      reduceF(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    case Op_AddReductionVD:
    case Op_MulReductionVD:
      reduceD(opcode, vlen, dst, src, vtmp1, vtmp2);
      break;

    default: assert(false, "wrong opcode");
  }
}

void C2_MacroAssembler::reduceB(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: reduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: reduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::mulreduceB(int opcode, int vlen,
                                   Register dst, Register src1, XMMRegister src2,
                                   XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  8: mulreduce8B (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 64: mulreduce64B(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceS(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  4: reduce4S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8S (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16S(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 32: reduce32S(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceI(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case  2: reduce2I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  4: reduce4I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case  8: reduce8I (opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 16: reduce16I(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}

#ifdef _LP64
void C2_MacroAssembler::reduceL(int opcode, int vlen,
                                Register dst, Register src1, XMMRegister src2,
                                XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2: reduce2L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 4: reduce4L(opcode, dst, src1, src2, vtmp1, vtmp2); break;
    case 8: reduce8L(opcode, dst, src1, src2, vtmp1, vtmp2); break;

    default: assert(false, "wrong vector length");
  }
}
#endif // _LP64

void C2_MacroAssembler::reduceF(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2F(opcode, dst, src, vtmp1);
      break;
    case 4:
      assert(vtmp2 == xnoreg, "");
      reduce4F(opcode, dst, src, vtmp1);
      break;
    case 8:
      reduce8F(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 16:
      reduce16F(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) {
  switch (vlen) {
    case 2:
      assert(vtmp2 == xnoreg, "");
      reduce2D(opcode, dst, src, vtmp1);
      break;
    case 4:
      reduce4D(opcode, dst, src, vtmp1, vtmp2);
      break;
    case 8:
      reduce8D(opcode, dst, src, vtmp1, vtmp2);
      break;
    default: assert(false, "wrong vector length");
  }
}

void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, vtmp1);
  } else {
    pshufd(vtmp1, src2, 0x1);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
  }
  movdl(vtmp2, src1);
  reduce_operation_128(T_INT, opcode, vtmp1, vtmp2);
  movdl(dst, vtmp1);
}

void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    if (vtmp1 != src2) {
      movdqu(vtmp1, src2);
    }
    phaddd(vtmp1, src2);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    pshufd(vtmp2, src2, 0xE);
    reduce_operation_128(T_INT, opcode, vtmp2, src2);
    reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  if (opcode == Op_AddReductionVI) {
    vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit);
    vextracti128_high(vtmp2, vtmp1);
    vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit);
    reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  } else {
    vextracti128_high(vtmp1, src2);
    reduce_operation_128(T_INT, opcode, vtmp1, src2);
    reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2);
  }
}

void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) {
  vextracti64x4_high(vtmp2, src2);
  reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2);
  reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2);
}
XMMRegister vtmp1, XMMRegister vtmp2) { 1568 switch (vlen) { 1569 case 2: 1570 assert(vtmp2 == xnoreg, ""); 1571 reduce2F(opcode, dst, src, vtmp1); 1572 break; 1573 case 4: 1574 assert(vtmp2 == xnoreg, ""); 1575 reduce4F(opcode, dst, src, vtmp1); 1576 break; 1577 case 8: 1578 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1579 break; 1580 case 16: 1581 reduce16F(opcode, dst, src, vtmp1, vtmp2); 1582 break; 1583 default: assert(false, "wrong vector length"); 1584 } 1585 } 1586 1587 void C2_MacroAssembler::reduceD(int opcode, int vlen, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1588 switch (vlen) { 1589 case 2: 1590 assert(vtmp2 == xnoreg, ""); 1591 reduce2D(opcode, dst, src, vtmp1); 1592 break; 1593 case 4: 1594 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1595 break; 1596 case 8: 1597 reduce8D(opcode, dst, src, vtmp1, vtmp2); 1598 break; 1599 default: assert(false, "wrong vector length"); 1600 } 1601 } 1602 1603 void C2_MacroAssembler::reduce2I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1604 if (opcode == Op_AddReductionVI) { 1605 if (vtmp1 != src2) { 1606 movdqu(vtmp1, src2); 1607 } 1608 phaddd(vtmp1, vtmp1); 1609 } else { 1610 pshufd(vtmp1, src2, 0x1); 1611 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1612 } 1613 movdl(vtmp2, src1); 1614 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1615 movdl(dst, vtmp1); 1616 } 1617 1618 void C2_MacroAssembler::reduce4I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1619 if (opcode == Op_AddReductionVI) { 1620 if (vtmp1 != src2) { 1621 movdqu(vtmp1, src2); 1622 } 1623 phaddd(vtmp1, src2); 1624 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1625 } else { 1626 pshufd(vtmp2, src2, 0xE); 1627 reduce_operation_128(T_INT, opcode, vtmp2, src2); 1628 reduce2I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1629 } 1630 } 1631 1632 void C2_MacroAssembler::reduce8I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1633 if (opcode == Op_AddReductionVI) { 1634 vphaddd(vtmp1, src2, src2, Assembler::AVX_256bit); 1635 vextracti128_high(vtmp2, vtmp1); 1636 vpaddd(vtmp1, vtmp1, vtmp2, Assembler::AVX_128bit); 1637 reduce2I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1638 } else { 1639 vextracti128_high(vtmp1, src2); 1640 reduce_operation_128(T_INT, opcode, vtmp1, src2); 1641 reduce4I(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1642 } 1643 } 1644 1645 void C2_MacroAssembler::reduce16I(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1646 vextracti64x4_high(vtmp2, src2); 1647 reduce_operation_256(T_INT, opcode, vtmp2, vtmp2, src2); 1648 reduce8I(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1649 } 1650 1651 void C2_MacroAssembler::reduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1652 pshufd(vtmp2, src2, 0x1); 1653 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1654 movdqu(vtmp1, vtmp2); 1655 psrldq(vtmp1, 2); 1656 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1657 movdqu(vtmp2, vtmp1); 1658 psrldq(vtmp2, 1); 1659 reduce_operation_128(T_BYTE, opcode, vtmp1, vtmp2); 1660 movdl(vtmp2, src1); 1661 pmovsxbd(vtmp1, vtmp1); 1662 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1663 pextrb(dst, vtmp1, 0x0); 1664 movsbl(dst, dst); 1665 } 1666 1667 void C2_MacroAssembler::reduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1668 
pshufd(vtmp1, src2, 0xE); 1669 reduce_operation_128(T_BYTE, opcode, vtmp1, src2); 1670 reduce8B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1671 } 1672 1673 void C2_MacroAssembler::reduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1674 vextracti128_high(vtmp2, src2); 1675 reduce_operation_128(T_BYTE, opcode, vtmp2, src2); 1676 reduce16B(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1677 } 1678 1679 void C2_MacroAssembler::reduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1680 vextracti64x4_high(vtmp1, src2); 1681 reduce_operation_256(T_BYTE, opcode, vtmp1, vtmp1, src2); 1682 reduce32B(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1683 } 1684 1685 void C2_MacroAssembler::mulreduce8B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1686 pmovsxbw(vtmp2, src2); 1687 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1688 } 1689 1690 void C2_MacroAssembler::mulreduce16B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1691 if (UseAVX > 1) { 1692 int vector_len = Assembler::AVX_256bit; 1693 vpmovsxbw(vtmp1, src2, vector_len); 1694 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1695 } else { 1696 pmovsxbw(vtmp2, src2); 1697 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1698 pshufd(vtmp2, src2, 0x1); 1699 pmovsxbw(vtmp2, src2); 1700 reduce8S(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1701 } 1702 } 1703 1704 void C2_MacroAssembler::mulreduce32B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1705 if (UseAVX > 2 && VM_Version::supports_avx512bw()) { 1706 int vector_len = Assembler::AVX_512bit; 1707 vpmovsxbw(vtmp1, src2, vector_len); 1708 reduce32S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1709 } else { 1710 assert(UseAVX >= 2,"Should not reach here."); 1711 mulreduce16B(opcode, dst, src1, src2, vtmp1, vtmp2); 1712 vextracti128_high(vtmp2, src2); 1713 mulreduce16B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1714 } 1715 } 1716 1717 void C2_MacroAssembler::mulreduce64B(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1718 mulreduce32B(opcode, dst, src1, src2, vtmp1, vtmp2); 1719 vextracti64x4_high(vtmp2, src2); 1720 mulreduce32B(opcode, dst, dst, vtmp2, vtmp1, vtmp2); 1721 } 1722 1723 void C2_MacroAssembler::reduce4S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1724 if (opcode == Op_AddReductionVI) { 1725 if (vtmp1 != src2) { 1726 movdqu(vtmp1, src2); 1727 } 1728 phaddw(vtmp1, vtmp1); 1729 phaddw(vtmp1, vtmp1); 1730 } else { 1731 pshufd(vtmp2, src2, 0x1); 1732 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1733 movdqu(vtmp1, vtmp2); 1734 psrldq(vtmp1, 2); 1735 reduce_operation_128(T_SHORT, opcode, vtmp1, vtmp2); 1736 } 1737 movdl(vtmp2, src1); 1738 pmovsxwd(vtmp1, vtmp1); 1739 reduce_operation_128(T_INT, opcode, vtmp1, vtmp2); 1740 pextrw(dst, vtmp1, 0x0); 1741 movswl(dst, dst); 1742 } 1743 1744 void C2_MacroAssembler::reduce8S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1745 if (opcode == Op_AddReductionVI) { 1746 if (vtmp1 != src2) { 1747 movdqu(vtmp1, src2); 1748 } 1749 phaddw(vtmp1, src2); 1750 } else { 1751 pshufd(vtmp1, src2, 0xE); 1752 reduce_operation_128(T_SHORT, opcode, vtmp1, src2); 1753 } 1754 reduce4S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1755 } 1756 1757 void 
C2_MacroAssembler::reduce16S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1758 if (opcode == Op_AddReductionVI) { 1759 int vector_len = Assembler::AVX_256bit; 1760 vphaddw(vtmp2, src2, src2, vector_len); 1761 vpermq(vtmp2, vtmp2, 0xD8, vector_len); 1762 } else { 1763 vextracti128_high(vtmp2, src2); 1764 reduce_operation_128(T_SHORT, opcode, vtmp2, src2); 1765 } 1766 reduce8S(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1767 } 1768 1769 void C2_MacroAssembler::reduce32S(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1770 int vector_len = Assembler::AVX_256bit; 1771 vextracti64x4_high(vtmp1, src2); 1772 reduce_operation_256(T_SHORT, opcode, vtmp1, vtmp1, src2); 1773 reduce16S(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1774 } 1775 1776 #ifdef _LP64 1777 void C2_MacroAssembler::reduce2L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1778 pshufd(vtmp2, src2, 0xE); 1779 reduce_operation_128(T_LONG, opcode, vtmp2, src2); 1780 movdq(vtmp1, src1); 1781 reduce_operation_128(T_LONG, opcode, vtmp1, vtmp2); 1782 movdq(dst, vtmp1); 1783 } 1784 1785 void C2_MacroAssembler::reduce4L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1786 vextracti128_high(vtmp1, src2); 1787 reduce_operation_128(T_LONG, opcode, vtmp1, src2); 1788 reduce2L(opcode, dst, src1, vtmp1, vtmp1, vtmp2); 1789 } 1790 1791 void C2_MacroAssembler::reduce8L(int opcode, Register dst, Register src1, XMMRegister src2, XMMRegister vtmp1, XMMRegister vtmp2) { 1792 vextracti64x4_high(vtmp2, src2); 1793 reduce_operation_256(T_LONG, opcode, vtmp2, vtmp2, src2); 1794 reduce4L(opcode, dst, src1, vtmp2, vtmp1, vtmp2); 1795 } 1796 #endif // _LP64 1797 1798 void C2_MacroAssembler::reduce2F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1799 reduce_operation_128(T_FLOAT, opcode, dst, src); 1800 pshufd(vtmp, src, 0x1); 1801 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1802 } 1803 1804 void C2_MacroAssembler::reduce4F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1805 reduce2F(opcode, dst, src, vtmp); 1806 pshufd(vtmp, src, 0x2); 1807 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1808 pshufd(vtmp, src, 0x3); 1809 reduce_operation_128(T_FLOAT, opcode, dst, vtmp); 1810 } 1811 1812 void C2_MacroAssembler::reduce8F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1813 reduce4F(opcode, dst, src, vtmp2); 1814 vextractf128_high(vtmp2, src); 1815 reduce4F(opcode, dst, vtmp2, vtmp1); 1816 } 1817 1818 void C2_MacroAssembler::reduce16F(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1819 reduce8F(opcode, dst, src, vtmp1, vtmp2); 1820 vextracti64x4_high(vtmp1, src); 1821 reduce8F(opcode, dst, vtmp1, vtmp1, vtmp2); 1822 } 1823 1824 void C2_MacroAssembler::reduce2D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp) { 1825 reduce_operation_128(T_DOUBLE, opcode, dst, src); 1826 pshufd(vtmp, src, 0xE); 1827 reduce_operation_128(T_DOUBLE, opcode, dst, vtmp); 1828 } 1829 1830 void C2_MacroAssembler::reduce4D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) { 1831 reduce2D(opcode, dst, src, vtmp2); 1832 vextractf128_high(vtmp2, src); 1833 reduce2D(opcode, dst, vtmp2, vtmp1); 1834 } 1835 1836 void C2_MacroAssembler::reduce8D(int opcode, XMMRegister dst, XMMRegister src, XMMRegister vtmp1, XMMRegister vtmp2) 
{ 1837 reduce4D(opcode, dst, src, vtmp1, vtmp2); 1838 vextracti64x4_high(vtmp1, src); 1839 reduce4D(opcode, dst, vtmp1, vtmp1, vtmp2); 1840 } 1841 1842 void C2_MacroAssembler::reduceFloatMinMax(int opcode, int vlen, bool is_dst_valid, 1843 XMMRegister dst, XMMRegister src, 1844 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1845 XMMRegister xmm_0, XMMRegister xmm_1) { 1846 int permconst[] = {1, 14}; 1847 XMMRegister wsrc = src; 1848 XMMRegister wdst = xmm_0; 1849 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 1850 1851 int vlen_enc = Assembler::AVX_128bit; 1852 if (vlen == 16) { 1853 vlen_enc = Assembler::AVX_256bit; 1854 } 1855 1856 for (int i = log2(vlen) - 1; i >=0; i--) { 1857 if (i == 0 && !is_dst_valid) { 1858 wdst = dst; 1859 } 1860 if (i == 3) { 1861 vextracti64x4_high(wtmp, wsrc); 1862 } else if (i == 2) { 1863 vextracti128_high(wtmp, wsrc); 1864 } else { // i = [0,1] 1865 vpermilps(wtmp, wsrc, permconst[i], vlen_enc); 1866 } 1867 vminmax_fp(opcode, T_FLOAT, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 1868 wsrc = wdst; 1869 vlen_enc = Assembler::AVX_128bit; 1870 } 1871 if (is_dst_valid) { 1872 vminmax_fp(opcode, T_FLOAT, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 1873 } 1874 } 1875 1876 void C2_MacroAssembler::reduceDoubleMinMax(int opcode, int vlen, bool is_dst_valid, XMMRegister dst, XMMRegister src, 1877 XMMRegister tmp, XMMRegister atmp, XMMRegister btmp, 1878 XMMRegister xmm_0, XMMRegister xmm_1) { 1879 XMMRegister wsrc = src; 1880 XMMRegister wdst = xmm_0; 1881 XMMRegister wtmp = (xmm_1 == xnoreg) ? xmm_0: xmm_1; 1882 int vlen_enc = Assembler::AVX_128bit; 1883 if (vlen == 8) { 1884 vlen_enc = Assembler::AVX_256bit; 1885 } 1886 for (int i = log2(vlen) - 1; i >=0; i--) { 1887 if (i == 0 && !is_dst_valid) { 1888 wdst = dst; 1889 } 1890 if (i == 1) { 1891 vextracti128_high(wtmp, wsrc); 1892 } else if (i == 2) { 1893 vextracti64x4_high(wtmp, wsrc); 1894 } else { 1895 assert(i == 0, "%d", i); 1896 vpermilpd(wtmp, wsrc, 1, vlen_enc); 1897 } 1898 vminmax_fp(opcode, T_DOUBLE, wdst, wtmp, wsrc, tmp, atmp, btmp, vlen_enc); 1899 wsrc = wdst; 1900 vlen_enc = Assembler::AVX_128bit; 1901 } 1902 if (is_dst_valid) { 1903 vminmax_fp(opcode, T_DOUBLE, dst, wdst, dst, tmp, atmp, btmp, Assembler::AVX_128bit); 1904 } 1905 } 1906 1907 void C2_MacroAssembler::extract(BasicType bt, Register dst, XMMRegister src, int idx) { 1908 switch (bt) { 1909 case T_BYTE: pextrb(dst, src, idx); break; 1910 case T_SHORT: pextrw(dst, src, idx); break; 1911 case T_INT: pextrd(dst, src, idx); break; 1912 case T_LONG: pextrq(dst, src, idx); break; 1913 1914 default: 1915 assert(false,"Should not reach here."); 1916 break; 1917 } 1918 } 1919 1920 XMMRegister C2_MacroAssembler::get_lane(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex) { 1921 int esize = type2aelembytes(typ); 1922 int elem_per_lane = 16/esize; 1923 int lane = elemindex / elem_per_lane; 1924 int eindex = elemindex % elem_per_lane; 1925 1926 if (lane >= 2) { 1927 assert(UseAVX > 2, "required"); 1928 vextractf32x4(dst, src, lane & 3); 1929 return dst; 1930 } else if (lane > 0) { 1931 assert(UseAVX > 0, "required"); 1932 vextractf128(dst, src, lane); 1933 return dst; 1934 } else { 1935 return src; 1936 } 1937 } 1938 1939 void C2_MacroAssembler::get_elem(BasicType typ, Register dst, XMMRegister src, int elemindex) { 1940 int esize = type2aelembytes(typ); 1941 int elem_per_lane = 16/esize; 1942 int eindex = elemindex % elem_per_lane; 1943 assert(is_integral_type(typ),"required"); 1944 1945 if (eindex == 0) { 1946 if 
(typ == T_LONG) { 1947 movq(dst, src); 1948 } else { 1949 movdl(dst, src); 1950 if (typ == T_BYTE) 1951 movsbl(dst, dst); 1952 else if (typ == T_SHORT) 1953 movswl(dst, dst); 1954 } 1955 } else { 1956 extract(typ, dst, src, eindex); 1957 } 1958 } 1959 1960 void C2_MacroAssembler::get_elem(BasicType typ, XMMRegister dst, XMMRegister src, int elemindex, Register tmp, XMMRegister vtmp) { 1961 int esize = type2aelembytes(typ); 1962 int elem_per_lane = 16/esize; 1963 int eindex = elemindex % elem_per_lane; 1964 assert((typ == T_FLOAT || typ == T_DOUBLE),"required"); 1965 1966 if (eindex == 0) { 1967 movq(dst, src); 1968 } else { 1969 if (typ == T_FLOAT) { 1970 if (UseAVX == 0) { 1971 movdqu(dst, src); 1972 pshufps(dst, dst, eindex); 1973 } else { 1974 vpshufps(dst, src, src, eindex, Assembler::AVX_128bit); 1975 } 1976 } else { 1977 if (UseAVX == 0) { 1978 movdqu(dst, src); 1979 psrldq(dst, eindex*esize); 1980 } else { 1981 vpsrldq(dst, src, eindex*esize, Assembler::AVX_128bit); 1982 } 1983 movq(dst, dst); 1984 } 1985 } 1986 // Zero upper bits 1987 if (typ == T_FLOAT) { 1988 if (UseAVX == 0) { 1989 assert((vtmp != xnoreg) && (tmp != noreg), "required."); 1990 movdqu(vtmp, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), tmp); 1991 pand(dst, vtmp); 1992 } else { 1993 assert((tmp != noreg), "required."); 1994 vpand(dst, dst, ExternalAddress(StubRoutines::x86::vector_32_bit_mask()), Assembler::AVX_128bit, tmp); 1995 } 1996 } 1997 } 1998 1999 void C2_MacroAssembler::evpcmp(BasicType typ, KRegister kdmask, KRegister ksmask, XMMRegister src1, AddressLiteral adr, int comparison, int vector_len, Register scratch) { 2000 switch(typ) { 2001 case T_BYTE: 2002 evpcmpb(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 2003 break; 2004 case T_SHORT: 2005 evpcmpw(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 2006 break; 2007 case T_INT: 2008 case T_FLOAT: 2009 evpcmpd(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 2010 break; 2011 case T_LONG: 2012 case T_DOUBLE: 2013 evpcmpq(kdmask, ksmask, src1, adr, comparison, vector_len, scratch); 2014 break; 2015 default: 2016 assert(false,"Should not reach here."); 2017 break; 2018 } 2019 } 2020 2021 void C2_MacroAssembler::evpblend(BasicType typ, XMMRegister dst, KRegister kmask, XMMRegister src1, XMMRegister src2, bool merge, int vector_len) { 2022 switch(typ) { 2023 case T_BYTE: 2024 evpblendmb(dst, kmask, src1, src2, merge, vector_len); 2025 break; 2026 case T_SHORT: 2027 evpblendmw(dst, kmask, src1, src2, merge, vector_len); 2028 break; 2029 case T_INT: 2030 case T_FLOAT: 2031 evpblendmd(dst, kmask, src1, src2, merge, vector_len); 2032 break; 2033 case T_LONG: 2034 case T_DOUBLE: 2035 evpblendmq(dst, kmask, src1, src2, merge, vector_len); 2036 break; 2037 default: 2038 assert(false,"Should not reach here."); 2039 break; 2040 } 2041 } 2042 2043 //------------------------------------------------------------------------------------------- 2044 2045 // IndexOf for constant substrings with size >= 8 chars 2046 // which don't need to be loaded through stack. 
2047 void C2_MacroAssembler::string_indexofC8(Register str1, Register str2, 2048 Register cnt1, Register cnt2, 2049 int int_cnt2, Register result, 2050 XMMRegister vec, Register tmp, 2051 int ae) { 2052 ShortBranchVerifier sbv(this); 2053 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2054 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2055 2056 // This method uses the pcmpestri instruction with bound registers 2057 // inputs: 2058 // xmm - substring 2059 // rax - substring length (elements count) 2060 // mem - scanned string 2061 // rdx - string length (elements count) 2062 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2063 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2064 // outputs: 2065 // rcx - matched index in string 2066 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2067 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2068 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2069 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2070 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2071 2072 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 2073 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 2074 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 2075 2076 // Note, inline_string_indexOf() generates checks: 2077 // if (substr.count > string.count) return -1; 2078 // if (substr.count == 0) return 0; 2079 assert(int_cnt2 >= stride, "this code is used only for cnt2 >= 8 chars"); 2080 2081 // Load substring. 2082 if (ae == StrIntrinsicNode::UL) { 2083 pmovzxbw(vec, Address(str2, 0)); 2084 } else { 2085 movdqu(vec, Address(str2, 0)); 2086 } 2087 movl(cnt2, int_cnt2); 2088 movptr(result, str1); // string addr 2089 2090 if (int_cnt2 > stride) { 2091 jmpb(SCAN_TO_SUBSTR); 2092 2093 // Reload substr for rescan, this code 2094 // is executed only for large substrings (> 8 chars) 2095 bind(RELOAD_SUBSTR); 2096 if (ae == StrIntrinsicNode::UL) { 2097 pmovzxbw(vec, Address(str2, 0)); 2098 } else { 2099 movdqu(vec, Address(str2, 0)); 2100 } 2101 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 2102 2103 bind(RELOAD_STR); 2104 // We came here after the beginning of the substring was 2105 // matched but the rest of it was not so we need to search 2106 // again. Start from the next element after the previous match. 2107 2108 // cnt2 is number of substring reminding elements and 2109 // cnt1 is number of string reminding elements when cmp failed. 2110 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 2111 subl(cnt1, cnt2); 2112 addl(cnt1, int_cnt2); 2113 movl(cnt2, int_cnt2); // Now restore cnt2 2114 2115 decrementl(cnt1); // Shift to next element 2116 cmpl(cnt1, cnt2); 2117 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2118 2119 addptr(result, (1<<scale1)); 2120 2121 } // (int_cnt2 > 8) 2122 2123 // Scan string for start of substr in 16-byte vectors 2124 bind(SCAN_TO_SUBSTR); 2125 pcmpestri(vec, Address(result, 0), mode); 2126 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2127 subl(cnt1, stride); 2128 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2129 cmpl(cnt1, cnt2); 2130 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2131 addptr(result, 16); 2132 jmpb(SCAN_TO_SUBSTR); 2133 2134 // Found a potential substr 2135 bind(FOUND_CANDIDATE); 2136 // Matched whole vector if first element matched (tmp(rcx) == 0). 
2137 if (int_cnt2 == stride) { 2138 jccb(Assembler::overflow, RET_FOUND); // OF == 1 2139 } else { // int_cnt2 > 8 2140 jccb(Assembler::overflow, FOUND_SUBSTR); 2141 } 2142 // After pcmpestri tmp(rcx) contains matched element index 2143 // Compute start addr of substr 2144 lea(result, Address(result, tmp, scale1)); 2145 2146 // Make sure string is still long enough 2147 subl(cnt1, tmp); 2148 cmpl(cnt1, cnt2); 2149 if (int_cnt2 == stride) { 2150 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2151 } else { // int_cnt2 > 8 2152 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 2153 } 2154 // Left less then substring. 2155 2156 bind(RET_NOT_FOUND); 2157 movl(result, -1); 2158 jmp(EXIT); 2159 2160 if (int_cnt2 > stride) { 2161 // This code is optimized for the case when whole substring 2162 // is matched if its head is matched. 2163 bind(MATCH_SUBSTR_HEAD); 2164 pcmpestri(vec, Address(result, 0), mode); 2165 // Reload only string if does not match 2166 jcc(Assembler::noOverflow, RELOAD_STR); // OF == 0 2167 2168 Label CONT_SCAN_SUBSTR; 2169 // Compare the rest of substring (> 8 chars). 2170 bind(FOUND_SUBSTR); 2171 // First 8 chars are already matched. 2172 negptr(cnt2); 2173 addptr(cnt2, stride); 2174 2175 bind(SCAN_SUBSTR); 2176 subl(cnt1, stride); 2177 cmpl(cnt2, -stride); // Do not read beyond substring 2178 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 2179 // Back-up strings to avoid reading beyond substring: 2180 // cnt1 = cnt1 - cnt2 + 8 2181 addl(cnt1, cnt2); // cnt2 is negative 2182 addl(cnt1, stride); 2183 movl(cnt2, stride); negptr(cnt2); 2184 bind(CONT_SCAN_SUBSTR); 2185 if (int_cnt2 < (int)G) { 2186 int tail_off1 = int_cnt2<<scale1; 2187 int tail_off2 = int_cnt2<<scale2; 2188 if (ae == StrIntrinsicNode::UL) { 2189 pmovzxbw(vec, Address(str2, cnt2, scale2, tail_off2)); 2190 } else { 2191 movdqu(vec, Address(str2, cnt2, scale2, tail_off2)); 2192 } 2193 pcmpestri(vec, Address(result, cnt2, scale1, tail_off1), mode); 2194 } else { 2195 // calculate index in register to avoid integer overflow (int_cnt2*2) 2196 movl(tmp, int_cnt2); 2197 addptr(tmp, cnt2); 2198 if (ae == StrIntrinsicNode::UL) { 2199 pmovzxbw(vec, Address(str2, tmp, scale2, 0)); 2200 } else { 2201 movdqu(vec, Address(str2, tmp, scale2, 0)); 2202 } 2203 pcmpestri(vec, Address(result, tmp, scale1, 0), mode); 2204 } 2205 // Need to reload strings pointers if not matched whole vector 2206 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2207 addptr(cnt2, stride); 2208 jcc(Assembler::negative, SCAN_SUBSTR); 2209 // Fall through if found full substring 2210 2211 } // (int_cnt2 > 8) 2212 2213 bind(RET_FOUND); 2214 // Found result if we matched full small substring. 2215 // Compute substr offset 2216 subptr(result, str1); 2217 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2218 shrl(result, 1); // index 2219 } 2220 bind(EXIT); 2221 2222 } // string_indexofC8 2223 2224 // Small strings are loaded through stack if they cross page boundary. 2225 void C2_MacroAssembler::string_indexof(Register str1, Register str2, 2226 Register cnt1, Register cnt2, 2227 int int_cnt2, Register result, 2228 XMMRegister vec, Register tmp, 2229 int ae) { 2230 ShortBranchVerifier sbv(this); 2231 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2232 assert(ae != StrIntrinsicNode::LU, "Invalid encoding"); 2233 2234 // 2235 // int_cnt2 is length of small (< 8 chars) constant substring 2236 // or (-1) for non constant substring in which case its length 2237 // is in cnt2 register. 
2238 // 2239 // Note, inline_string_indexOf() generates checks: 2240 // if (substr.count > string.count) return -1; 2241 // if (substr.count == 0) return 0; 2242 // 2243 int stride = (ae == StrIntrinsicNode::LL) ? 16 : 8; //UU, UL -> 8 2244 assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < stride), "should be != 0"); 2245 // This method uses the pcmpestri instruction with bound registers 2246 // inputs: 2247 // xmm - substring 2248 // rax - substring length (elements count) 2249 // mem - scanned string 2250 // rdx - string length (elements count) 2251 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 2252 // 0xc - mode: 1100 (substring search) + 00 (unsigned bytes) 2253 // outputs: 2254 // rcx - matched index in string 2255 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2256 int mode = (ae == StrIntrinsicNode::LL) ? 0x0c : 0x0d; // bytes or shorts 2257 Address::ScaleFactor scale1 = (ae == StrIntrinsicNode::LL) ? Address::times_1 : Address::times_2; 2258 Address::ScaleFactor scale2 = (ae == StrIntrinsicNode::UL) ? Address::times_1 : scale1; 2259 2260 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR, 2261 RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR, 2262 FOUND_CANDIDATE; 2263 2264 { //======================================================== 2265 // We don't know where these strings are located 2266 // and we can't read beyond them. Load them through stack. 2267 Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR; 2268 2269 movptr(tmp, rsp); // save old SP 2270 2271 if (int_cnt2 > 0) { // small (< 8 chars) constant substring 2272 if (int_cnt2 == (1>>scale2)) { // One byte 2273 assert((ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL), "Only possible for latin1 encoding"); 2274 load_unsigned_byte(result, Address(str2, 0)); 2275 movdl(vec, result); // move 32 bits 2276 } else if (ae == StrIntrinsicNode::LL && int_cnt2 == 3) { // Three bytes 2277 // Not enough header space in 32-bit VM: 12+3 = 15. 2278 movl(result, Address(str2, -1)); 2279 shrl(result, 8); 2280 movdl(vec, result); // move 32 bits 2281 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (2>>scale2)) { // One char 2282 load_unsigned_short(result, Address(str2, 0)); 2283 movdl(vec, result); // move 32 bits 2284 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (4>>scale2)) { // Two chars 2285 movdl(vec, Address(str2, 0)); // move 32 bits 2286 } else if (ae != StrIntrinsicNode::UL && int_cnt2 == (8>>scale2)) { // Four chars 2287 movq(vec, Address(str2, 0)); // move 64 bits 2288 } else { // cnt2 = { 3, 5, 6, 7 } || (ae == StrIntrinsicNode::UL && cnt2 ={2, ..., 7}) 2289 // Array header size is 12 bytes in 32-bit VM 2290 // + 6 bytes for 3 chars == 18 bytes, 2291 // enough space to load vec and shift. 2292 assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity"); 2293 if (ae == StrIntrinsicNode::UL) { 2294 int tail_off = int_cnt2-8; 2295 pmovzxbw(vec, Address(str2, tail_off)); 2296 psrldq(vec, -2*tail_off); 2297 } 2298 else { 2299 int tail_off = int_cnt2*(1<<scale2); 2300 movdqu(vec, Address(str2, tail_off-16)); 2301 psrldq(vec, 16-tail_off); 2302 } 2303 } 2304 } else { // not constant substring 2305 cmpl(cnt2, stride); 2306 jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough 2307 2308 // We can read beyond string if srt+16 does not cross page boundary 2309 // since heaps are aligned and mapped by pages. 
2310 assert(os::vm_page_size() < (int)G, "default page should be small"); 2311 movl(result, str2); // We need only low 32 bits 2312 andl(result, (os::vm_page_size()-1)); 2313 cmpl(result, (os::vm_page_size()-16)); 2314 jccb(Assembler::belowEqual, CHECK_STR); 2315 2316 // Move small strings to stack to allow load 16 bytes into vec. 2317 subptr(rsp, 16); 2318 int stk_offset = wordSize-(1<<scale2); 2319 push(cnt2); 2320 2321 bind(COPY_SUBSTR); 2322 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UL) { 2323 load_unsigned_byte(result, Address(str2, cnt2, scale2, -1)); 2324 movb(Address(rsp, cnt2, scale2, stk_offset), result); 2325 } else if (ae == StrIntrinsicNode::UU) { 2326 load_unsigned_short(result, Address(str2, cnt2, scale2, -2)); 2327 movw(Address(rsp, cnt2, scale2, stk_offset), result); 2328 } 2329 decrement(cnt2); 2330 jccb(Assembler::notZero, COPY_SUBSTR); 2331 2332 pop(cnt2); 2333 movptr(str2, rsp); // New substring address 2334 } // non constant 2335 2336 bind(CHECK_STR); 2337 cmpl(cnt1, stride); 2338 jccb(Assembler::aboveEqual, BIG_STRINGS); 2339 2340 // Check cross page boundary. 2341 movl(result, str1); // We need only low 32 bits 2342 andl(result, (os::vm_page_size()-1)); 2343 cmpl(result, (os::vm_page_size()-16)); 2344 jccb(Assembler::belowEqual, BIG_STRINGS); 2345 2346 subptr(rsp, 16); 2347 int stk_offset = -(1<<scale1); 2348 if (int_cnt2 < 0) { // not constant 2349 push(cnt2); 2350 stk_offset += wordSize; 2351 } 2352 movl(cnt2, cnt1); 2353 2354 bind(COPY_STR); 2355 if (ae == StrIntrinsicNode::LL) { 2356 load_unsigned_byte(result, Address(str1, cnt2, scale1, -1)); 2357 movb(Address(rsp, cnt2, scale1, stk_offset), result); 2358 } else { 2359 load_unsigned_short(result, Address(str1, cnt2, scale1, -2)); 2360 movw(Address(rsp, cnt2, scale1, stk_offset), result); 2361 } 2362 decrement(cnt2); 2363 jccb(Assembler::notZero, COPY_STR); 2364 2365 if (int_cnt2 < 0) { // not constant 2366 pop(cnt2); 2367 } 2368 movptr(str1, rsp); // New string address 2369 2370 bind(BIG_STRINGS); 2371 // Load substring. 2372 if (int_cnt2 < 0) { // -1 2373 if (ae == StrIntrinsicNode::UL) { 2374 pmovzxbw(vec, Address(str2, 0)); 2375 } else { 2376 movdqu(vec, Address(str2, 0)); 2377 } 2378 push(cnt2); // substr count 2379 push(str2); // substr addr 2380 push(str1); // string addr 2381 } else { 2382 // Small (< 8 chars) constant substrings are loaded already. 2383 movl(cnt2, int_cnt2); 2384 } 2385 push(tmp); // original SP 2386 2387 } // Finished loading 2388 2389 //======================================================== 2390 // Start search 2391 // 2392 2393 movptr(result, str1); // string addr 2394 2395 if (int_cnt2 < 0) { // Only for non constant substring 2396 jmpb(SCAN_TO_SUBSTR); 2397 2398 // SP saved at sp+0 2399 // String saved at sp+1*wordSize 2400 // Substr saved at sp+2*wordSize 2401 // Substr count saved at sp+3*wordSize 2402 2403 // Reload substr for rescan, this code 2404 // is executed only for large substrings (> 8 chars) 2405 bind(RELOAD_SUBSTR); 2406 movptr(str2, Address(rsp, 2*wordSize)); 2407 movl(cnt2, Address(rsp, 3*wordSize)); 2408 if (ae == StrIntrinsicNode::UL) { 2409 pmovzxbw(vec, Address(str2, 0)); 2410 } else { 2411 movdqu(vec, Address(str2, 0)); 2412 } 2413 // We came here after the beginning of the substring was 2414 // matched but the rest of it was not so we need to search 2415 // again. Start from the next element after the previous match. 
2416 subptr(str1, result); // Restore counter 2417 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2418 shrl(str1, 1); 2419 } 2420 addl(cnt1, str1); 2421 decrementl(cnt1); // Shift to next element 2422 cmpl(cnt1, cnt2); 2423 jcc(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2424 2425 addptr(result, (1<<scale1)); 2426 } // non constant 2427 2428 // Scan string for start of substr in 16-byte vectors 2429 bind(SCAN_TO_SUBSTR); 2430 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 2431 pcmpestri(vec, Address(result, 0), mode); 2432 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 2433 subl(cnt1, stride); 2434 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 2435 cmpl(cnt1, cnt2); 2436 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 2437 addptr(result, 16); 2438 2439 bind(ADJUST_STR); 2440 cmpl(cnt1, stride); // Do not read beyond string 2441 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 2442 // Back-up string to avoid reading beyond string. 2443 lea(result, Address(result, cnt1, scale1, -16)); 2444 movl(cnt1, stride); 2445 jmpb(SCAN_TO_SUBSTR); 2446 2447 // Found a potential substr 2448 bind(FOUND_CANDIDATE); 2449 // After pcmpestri tmp(rcx) contains matched element index 2450 2451 // Make sure string is still long enough 2452 subl(cnt1, tmp); 2453 cmpl(cnt1, cnt2); 2454 jccb(Assembler::greaterEqual, FOUND_SUBSTR); 2455 // Left less then substring. 2456 2457 bind(RET_NOT_FOUND); 2458 movl(result, -1); 2459 jmp(CLEANUP); 2460 2461 bind(FOUND_SUBSTR); 2462 // Compute start addr of substr 2463 lea(result, Address(result, tmp, scale1)); 2464 if (int_cnt2 > 0) { // Constant substring 2465 // Repeat search for small substring (< 8 chars) 2466 // from new point without reloading substring. 2467 // Have to check that we don't read beyond string. 2468 cmpl(tmp, stride-int_cnt2); 2469 jccb(Assembler::greater, ADJUST_STR); 2470 // Fall through if matched whole substring. 2471 } else { // non constant 2472 assert(int_cnt2 == -1, "should be != 0"); 2473 2474 addl(tmp, cnt2); 2475 // Found result if we matched whole substring. 2476 cmpl(tmp, stride); 2477 jcc(Assembler::lessEqual, RET_FOUND); 2478 2479 // Repeat search for small substring (<= 8 chars) 2480 // from new point 'str1' without reloading substring. 2481 cmpl(cnt2, stride); 2482 // Have to check that we don't read beyond string. 2483 jccb(Assembler::lessEqual, ADJUST_STR); 2484 2485 Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG; 2486 // Compare the rest of substring (> 8 chars). 2487 movptr(str1, result); 2488 2489 cmpl(tmp, cnt2); 2490 // First 8 chars are already matched. 2491 jccb(Assembler::equal, CHECK_NEXT); 2492 2493 bind(SCAN_SUBSTR); 2494 pcmpestri(vec, Address(str1, 0), mode); 2495 // Need to reload strings pointers if not matched whole vector 2496 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 2497 2498 bind(CHECK_NEXT); 2499 subl(cnt2, stride); 2500 jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring 2501 addptr(str1, 16); 2502 if (ae == StrIntrinsicNode::UL) { 2503 addptr(str2, 8); 2504 } else { 2505 addptr(str2, 16); 2506 } 2507 subl(cnt1, stride); 2508 cmpl(cnt2, stride); // Do not read beyond substring 2509 jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR); 2510 // Back-up strings to avoid reading beyond substring. 
2511 2512 if (ae == StrIntrinsicNode::UL) { 2513 lea(str2, Address(str2, cnt2, scale2, -8)); 2514 lea(str1, Address(str1, cnt2, scale1, -16)); 2515 } else { 2516 lea(str2, Address(str2, cnt2, scale2, -16)); 2517 lea(str1, Address(str1, cnt2, scale1, -16)); 2518 } 2519 subl(cnt1, cnt2); 2520 movl(cnt2, stride); 2521 addl(cnt1, stride); 2522 bind(CONT_SCAN_SUBSTR); 2523 if (ae == StrIntrinsicNode::UL) { 2524 pmovzxbw(vec, Address(str2, 0)); 2525 } else { 2526 movdqu(vec, Address(str2, 0)); 2527 } 2528 jmp(SCAN_SUBSTR); 2529 2530 bind(RET_FOUND_LONG); 2531 movptr(str1, Address(rsp, wordSize)); 2532 } // non constant 2533 2534 bind(RET_FOUND); 2535 // Compute substr offset 2536 subptr(result, str1); 2537 if (ae == StrIntrinsicNode::UU || ae == StrIntrinsicNode::UL) { 2538 shrl(result, 1); // index 2539 } 2540 bind(CLEANUP); 2541 pop(rsp); // restore SP 2542 2543 } // string_indexof 2544 2545 void C2_MacroAssembler::string_indexof_char(Register str1, Register cnt1, Register ch, Register result, 2546 XMMRegister vec1, XMMRegister vec2, XMMRegister vec3, Register tmp) { 2547 ShortBranchVerifier sbv(this); 2548 assert(UseSSE42Intrinsics, "SSE4.2 intrinsics are required"); 2549 2550 int stride = 8; 2551 2552 Label FOUND_CHAR, SCAN_TO_CHAR, SCAN_TO_CHAR_LOOP, 2553 SCAN_TO_8_CHAR, SCAN_TO_8_CHAR_LOOP, SCAN_TO_16_CHAR_LOOP, 2554 RET_NOT_FOUND, SCAN_TO_8_CHAR_INIT, 2555 FOUND_SEQ_CHAR, DONE_LABEL; 2556 2557 movptr(result, str1); 2558 if (UseAVX >= 2) { 2559 cmpl(cnt1, stride); 2560 jcc(Assembler::less, SCAN_TO_CHAR); 2561 cmpl(cnt1, 2*stride); 2562 jcc(Assembler::less, SCAN_TO_8_CHAR_INIT); 2563 movdl(vec1, ch); 2564 vpbroadcastw(vec1, vec1, Assembler::AVX_256bit); 2565 vpxor(vec2, vec2); 2566 movl(tmp, cnt1); 2567 andl(tmp, 0xFFFFFFF0); //vector count (in chars) 2568 andl(cnt1,0x0000000F); //tail count (in chars) 2569 2570 bind(SCAN_TO_16_CHAR_LOOP); 2571 vmovdqu(vec3, Address(result, 0)); 2572 vpcmpeqw(vec3, vec3, vec1, 1); 2573 vptest(vec2, vec3); 2574 jcc(Assembler::carryClear, FOUND_CHAR); 2575 addptr(result, 32); 2576 subl(tmp, 2*stride); 2577 jcc(Assembler::notZero, SCAN_TO_16_CHAR_LOOP); 2578 jmp(SCAN_TO_8_CHAR); 2579 bind(SCAN_TO_8_CHAR_INIT); 2580 movdl(vec1, ch); 2581 pshuflw(vec1, vec1, 0x00); 2582 pshufd(vec1, vec1, 0); 2583 pxor(vec2, vec2); 2584 } 2585 bind(SCAN_TO_8_CHAR); 2586 cmpl(cnt1, stride); 2587 jcc(Assembler::less, SCAN_TO_CHAR); 2588 if (UseAVX < 2) { 2589 movdl(vec1, ch); 2590 pshuflw(vec1, vec1, 0x00); 2591 pshufd(vec1, vec1, 0); 2592 pxor(vec2, vec2); 2593 } 2594 movl(tmp, cnt1); 2595 andl(tmp, 0xFFFFFFF8); //vector count (in chars) 2596 andl(cnt1,0x00000007); //tail count (in chars) 2597 2598 bind(SCAN_TO_8_CHAR_LOOP); 2599 movdqu(vec3, Address(result, 0)); 2600 pcmpeqw(vec3, vec1); 2601 ptest(vec2, vec3); 2602 jcc(Assembler::carryClear, FOUND_CHAR); 2603 addptr(result, 16); 2604 subl(tmp, stride); 2605 jcc(Assembler::notZero, SCAN_TO_8_CHAR_LOOP); 2606 bind(SCAN_TO_CHAR); 2607 testl(cnt1, cnt1); 2608 jcc(Assembler::zero, RET_NOT_FOUND); 2609 bind(SCAN_TO_CHAR_LOOP); 2610 load_unsigned_short(tmp, Address(result, 0)); 2611 cmpl(ch, tmp); 2612 jccb(Assembler::equal, FOUND_SEQ_CHAR); 2613 addptr(result, 2); 2614 subl(cnt1, 1); 2615 jccb(Assembler::zero, RET_NOT_FOUND); 2616 jmp(SCAN_TO_CHAR_LOOP); 2617 2618 bind(RET_NOT_FOUND); 2619 movl(result, -1); 2620 jmpb(DONE_LABEL); 2621 2622 bind(FOUND_CHAR); 2623 if (UseAVX >= 2) { 2624 vpmovmskb(tmp, vec3); 2625 } else { 2626 pmovmskb(tmp, vec3); 2627 } 2628 bsfl(ch, tmp); 2629 addl(result, ch); 2630 2631 bind(FOUND_SEQ_CHAR); 2632 
subptr(result, str1); 2633 shrl(result, 1); 2634 2635 bind(DONE_LABEL); 2636 } // string_indexof_char 2637 2638 // helper function for string_compare 2639 void C2_MacroAssembler::load_next_elements(Register elem1, Register elem2, Register str1, Register str2, 2640 Address::ScaleFactor scale, Address::ScaleFactor scale1, 2641 Address::ScaleFactor scale2, Register index, int ae) { 2642 if (ae == StrIntrinsicNode::LL) { 2643 load_unsigned_byte(elem1, Address(str1, index, scale, 0)); 2644 load_unsigned_byte(elem2, Address(str2, index, scale, 0)); 2645 } else if (ae == StrIntrinsicNode::UU) { 2646 load_unsigned_short(elem1, Address(str1, index, scale, 0)); 2647 load_unsigned_short(elem2, Address(str2, index, scale, 0)); 2648 } else { 2649 load_unsigned_byte(elem1, Address(str1, index, scale1, 0)); 2650 load_unsigned_short(elem2, Address(str2, index, scale2, 0)); 2651 } 2652 } 2653 2654 // Compare strings, used for char[] and byte[]. 2655 void C2_MacroAssembler::string_compare(Register str1, Register str2, 2656 Register cnt1, Register cnt2, Register result, 2657 XMMRegister vec1, int ae) { 2658 ShortBranchVerifier sbv(this); 2659 Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL; 2660 Label COMPARE_WIDE_VECTORS_LOOP_FAILED; // used only _LP64 && AVX3 2661 int stride, stride2, adr_stride, adr_stride1, adr_stride2; 2662 int stride2x2 = 0x40; 2663 Address::ScaleFactor scale = Address::no_scale; 2664 Address::ScaleFactor scale1 = Address::no_scale; 2665 Address::ScaleFactor scale2 = Address::no_scale; 2666 2667 if (ae != StrIntrinsicNode::LL) { 2668 stride2x2 = 0x20; 2669 } 2670 2671 if (ae == StrIntrinsicNode::LU || ae == StrIntrinsicNode::UL) { 2672 shrl(cnt2, 1); 2673 } 2674 // Compute the minimum of the string lengths and the 2675 // difference of the string lengths (stack). 2676 // Do the conditional move stuff 2677 movl(result, cnt1); 2678 subl(cnt1, cnt2); 2679 push(cnt1); 2680 cmov32(Assembler::lessEqual, cnt2, result); // cnt2 = min(cnt1, cnt2) 2681 2682 // Is the minimum length zero? 
2683 testl(cnt2, cnt2); 2684 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2685 if (ae == StrIntrinsicNode::LL) { 2686 // Load first bytes 2687 load_unsigned_byte(result, Address(str1, 0)); // result = str1[0] 2688 load_unsigned_byte(cnt1, Address(str2, 0)); // cnt1 = str2[0] 2689 } else if (ae == StrIntrinsicNode::UU) { 2690 // Load first characters 2691 load_unsigned_short(result, Address(str1, 0)); 2692 load_unsigned_short(cnt1, Address(str2, 0)); 2693 } else { 2694 load_unsigned_byte(result, Address(str1, 0)); 2695 load_unsigned_short(cnt1, Address(str2, 0)); 2696 } 2697 subl(result, cnt1); 2698 jcc(Assembler::notZero, POP_LABEL); 2699 2700 if (ae == StrIntrinsicNode::UU) { 2701 // Divide length by 2 to get number of chars 2702 shrl(cnt2, 1); 2703 } 2704 cmpl(cnt2, 1); 2705 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 2706 2707 // Check if the strings start at the same location and setup scale and stride 2708 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2709 cmpptr(str1, str2); 2710 jcc(Assembler::equal, LENGTH_DIFF_LABEL); 2711 if (ae == StrIntrinsicNode::LL) { 2712 scale = Address::times_1; 2713 stride = 16; 2714 } else { 2715 scale = Address::times_2; 2716 stride = 8; 2717 } 2718 } else { 2719 scale1 = Address::times_1; 2720 scale2 = Address::times_2; 2721 // scale not used 2722 stride = 8; 2723 } 2724 2725 if (UseAVX >= 2 && UseSSE42Intrinsics) { 2726 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR; 2727 Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR; 2728 Label COMPARE_WIDE_VECTORS_LOOP_AVX2; 2729 Label COMPARE_TAIL_LONG; 2730 Label COMPARE_WIDE_VECTORS_LOOP_AVX3; // used only _LP64 && AVX3 2731 2732 int pcmpmask = 0x19; 2733 if (ae == StrIntrinsicNode::LL) { 2734 pcmpmask &= ~0x01; 2735 } 2736 2737 // Setup to compare 16-chars (32-bytes) vectors, 2738 // start from first character again because it has aligned address. 2739 if (ae == StrIntrinsicNode::LL) { 2740 stride2 = 32; 2741 } else { 2742 stride2 = 16; 2743 } 2744 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2745 adr_stride = stride << scale; 2746 } else { 2747 adr_stride1 = 8; //stride << scale1; 2748 adr_stride2 = 16; //stride << scale2; 2749 } 2750 2751 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 2752 // rax and rdx are used by pcmpestri as elements counters 2753 movl(result, cnt2); 2754 andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count 2755 jcc(Assembler::zero, COMPARE_TAIL_LONG); 2756 2757 // fast path : compare first 2 8-char vectors. 
2758 bind(COMPARE_16_CHARS); 2759 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2760 movdqu(vec1, Address(str1, 0)); 2761 } else { 2762 pmovzxbw(vec1, Address(str1, 0)); 2763 } 2764 pcmpestri(vec1, Address(str2, 0), pcmpmask); 2765 jccb(Assembler::below, COMPARE_INDEX_CHAR); 2766 2767 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2768 movdqu(vec1, Address(str1, adr_stride)); 2769 pcmpestri(vec1, Address(str2, adr_stride), pcmpmask); 2770 } else { 2771 pmovzxbw(vec1, Address(str1, adr_stride1)); 2772 pcmpestri(vec1, Address(str2, adr_stride2), pcmpmask); 2773 } 2774 jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS); 2775 addl(cnt1, stride); 2776 2777 // Compare the characters at index in cnt1 2778 bind(COMPARE_INDEX_CHAR); // cnt1 has the offset of the mismatching character 2779 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 2780 subl(result, cnt2); 2781 jmp(POP_LABEL); 2782 2783 // Setup the registers to start vector comparison loop 2784 bind(COMPARE_WIDE_VECTORS); 2785 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2786 lea(str1, Address(str1, result, scale)); 2787 lea(str2, Address(str2, result, scale)); 2788 } else { 2789 lea(str1, Address(str1, result, scale1)); 2790 lea(str2, Address(str2, result, scale2)); 2791 } 2792 subl(result, stride2); 2793 subl(cnt2, stride2); 2794 jcc(Assembler::zero, COMPARE_WIDE_TAIL); 2795 negptr(result); 2796 2797 // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest) 2798 bind(COMPARE_WIDE_VECTORS_LOOP); 2799 2800 #ifdef _LP64 2801 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 2802 cmpl(cnt2, stride2x2); 2803 jccb(Assembler::below, COMPARE_WIDE_VECTORS_LOOP_AVX2); 2804 testl(cnt2, stride2x2-1); // cnt2 holds the vector count 2805 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX2); // means we cannot subtract by 0x40 2806 2807 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 2808 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2809 evmovdquq(vec1, Address(str1, result, scale), Assembler::AVX_512bit); 2810 evpcmpeqb(k7, vec1, Address(str2, result, scale), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 2811 } else { 2812 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_512bit); 2813 evpcmpeqb(k7, vec1, Address(str2, result, scale2), Assembler::AVX_512bit); // k7 == 11..11, if operands equal, otherwise k7 has some 0 2814 } 2815 kortestql(k7, k7); 2816 jcc(Assembler::aboveEqual, COMPARE_WIDE_VECTORS_LOOP_FAILED); // miscompare 2817 addptr(result, stride2x2); // update since we already compared at this addr 2818 subl(cnt2, stride2x2); // and sub the size too 2819 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP_AVX3); 2820 2821 vpxor(vec1, vec1); 2822 jmpb(COMPARE_WIDE_TAIL); 2823 }//if (VM_Version::supports_avx512vlbw()) 2824 #endif // _LP64 2825 2826 2827 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 2828 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2829 vmovdqu(vec1, Address(str1, result, scale)); 2830 vpxor(vec1, Address(str2, result, scale)); 2831 } else { 2832 vpmovzxbw(vec1, Address(str1, result, scale1), Assembler::AVX_256bit); 2833 vpxor(vec1, Address(str2, result, scale2)); 2834 } 2835 vptest(vec1, vec1); 2836 jcc(Assembler::notZero, VECTOR_NOT_EQUAL); 2837 addptr(result, stride2); 2838 subl(cnt2, stride2); 2839 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP); 2840 // clean upper bits of YMM registers 2841 
vpxor(vec1, vec1); 2842 2843 // compare wide vectors tail 2844 bind(COMPARE_WIDE_TAIL); 2845 testptr(result, result); 2846 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2847 2848 movl(result, stride2); 2849 movl(cnt2, result); 2850 negptr(result); 2851 jmp(COMPARE_WIDE_VECTORS_LOOP_AVX2); 2852 2853 // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors. 2854 bind(VECTOR_NOT_EQUAL); 2855 // clean upper bits of YMM registers 2856 vpxor(vec1, vec1); 2857 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2858 lea(str1, Address(str1, result, scale)); 2859 lea(str2, Address(str2, result, scale)); 2860 } else { 2861 lea(str1, Address(str1, result, scale1)); 2862 lea(str2, Address(str2, result, scale2)); 2863 } 2864 jmp(COMPARE_16_CHARS); 2865 2866 // Compare tail chars, length between 1 to 15 chars 2867 bind(COMPARE_TAIL_LONG); 2868 movl(cnt2, result); 2869 cmpl(cnt2, stride); 2870 jcc(Assembler::less, COMPARE_SMALL_STR); 2871 2872 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2873 movdqu(vec1, Address(str1, 0)); 2874 } else { 2875 pmovzxbw(vec1, Address(str1, 0)); 2876 } 2877 pcmpestri(vec1, Address(str2, 0), pcmpmask); 2878 jcc(Assembler::below, COMPARE_INDEX_CHAR); 2879 subptr(cnt2, stride); 2880 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2881 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2882 lea(str1, Address(str1, result, scale)); 2883 lea(str2, Address(str2, result, scale)); 2884 } else { 2885 lea(str1, Address(str1, result, scale1)); 2886 lea(str2, Address(str2, result, scale2)); 2887 } 2888 negptr(cnt2); 2889 jmpb(WHILE_HEAD_LABEL); 2890 2891 bind(COMPARE_SMALL_STR); 2892 } else if (UseSSE42Intrinsics) { 2893 Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL; 2894 int pcmpmask = 0x19; 2895 // Setup to compare 8-char (16-byte) vectors, 2896 // start from first character again because it has aligned address. 
2897 movl(result, cnt2); 2898 andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count 2899 if (ae == StrIntrinsicNode::LL) { 2900 pcmpmask &= ~0x01; 2901 } 2902 jcc(Assembler::zero, COMPARE_TAIL); 2903 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2904 lea(str1, Address(str1, result, scale)); 2905 lea(str2, Address(str2, result, scale)); 2906 } else { 2907 lea(str1, Address(str1, result, scale1)); 2908 lea(str2, Address(str2, result, scale2)); 2909 } 2910 negptr(result); 2911 2912 // pcmpestri 2913 // inputs: 2914 // vec1- substring 2915 // rax - negative string length (elements count) 2916 // mem - scanned string 2917 // rdx - string length (elements count) 2918 // pcmpmask - cmp mode: 11000 (string compare with negated result) 2919 // + 00 (unsigned bytes) or + 01 (unsigned shorts) 2920 // outputs: 2921 // rcx - first mismatched element index 2922 assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri"); 2923 2924 bind(COMPARE_WIDE_VECTORS); 2925 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2926 movdqu(vec1, Address(str1, result, scale)); 2927 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 2928 } else { 2929 pmovzxbw(vec1, Address(str1, result, scale1)); 2930 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 2931 } 2932 // After pcmpestri cnt1(rcx) contains mismatched element index 2933 2934 jccb(Assembler::below, VECTOR_NOT_EQUAL); // CF==1 2935 addptr(result, stride); 2936 subptr(cnt2, stride); 2937 jccb(Assembler::notZero, COMPARE_WIDE_VECTORS); 2938 2939 // compare wide vectors tail 2940 testptr(result, result); 2941 jcc(Assembler::zero, LENGTH_DIFF_LABEL); 2942 2943 movl(cnt2, stride); 2944 movl(result, stride); 2945 negptr(result); 2946 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2947 movdqu(vec1, Address(str1, result, scale)); 2948 pcmpestri(vec1, Address(str2, result, scale), pcmpmask); 2949 } else { 2950 pmovzxbw(vec1, Address(str1, result, scale1)); 2951 pcmpestri(vec1, Address(str2, result, scale2), pcmpmask); 2952 } 2953 jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL); 2954 2955 // Mismatched characters in the vectors 2956 bind(VECTOR_NOT_EQUAL); 2957 addptr(cnt1, result); 2958 load_next_elements(result, cnt2, str1, str2, scale, scale1, scale2, cnt1, ae); 2959 subl(result, cnt2); 2960 jmpb(POP_LABEL); 2961 2962 bind(COMPARE_TAIL); // limit is zero 2963 movl(cnt2, result); 2964 // Fallthru to tail compare 2965 } 2966 // Shift str2 and str1 to the end of the arrays, negate min 2967 if (ae == StrIntrinsicNode::LL || ae == StrIntrinsicNode::UU) { 2968 lea(str1, Address(str1, cnt2, scale)); 2969 lea(str2, Address(str2, cnt2, scale)); 2970 } else { 2971 lea(str1, Address(str1, cnt2, scale1)); 2972 lea(str2, Address(str2, cnt2, scale2)); 2973 } 2974 decrementl(cnt2); // first character was compared already 2975 negptr(cnt2); 2976 2977 // Compare the rest of the elements 2978 bind(WHILE_HEAD_LABEL); 2979 load_next_elements(result, cnt1, str1, str2, scale, scale1, scale2, cnt2, ae); 2980 subl(result, cnt1); 2981 jccb(Assembler::notZero, POP_LABEL); 2982 increment(cnt2); 2983 jccb(Assembler::notZero, WHILE_HEAD_LABEL); 2984 2985 // Strings are equal up to min length. Return the length difference. 
2986 bind(LENGTH_DIFF_LABEL); 2987 pop(result); 2988 if (ae == StrIntrinsicNode::UU) { 2989 // Divide diff by 2 to get number of chars 2990 sarl(result, 1); 2991 } 2992 jmpb(DONE_LABEL); 2993 2994 #ifdef _LP64 2995 if (VM_Version::supports_avx512vlbw()) { 2996 2997 bind(COMPARE_WIDE_VECTORS_LOOP_FAILED); 2998 2999 kmovql(cnt1, k7); 3000 notq(cnt1); 3001 bsfq(cnt2, cnt1); 3002 if (ae != StrIntrinsicNode::LL) { 3003 // Divide diff by 2 to get number of chars 3004 sarl(cnt2, 1); 3005 } 3006 addq(result, cnt2); 3007 if (ae == StrIntrinsicNode::LL) { 3008 load_unsigned_byte(cnt1, Address(str2, result)); 3009 load_unsigned_byte(result, Address(str1, result)); 3010 } else if (ae == StrIntrinsicNode::UU) { 3011 load_unsigned_short(cnt1, Address(str2, result, scale)); 3012 load_unsigned_short(result, Address(str1, result, scale)); 3013 } else { 3014 load_unsigned_short(cnt1, Address(str2, result, scale2)); 3015 load_unsigned_byte(result, Address(str1, result, scale1)); 3016 } 3017 subl(result, cnt1); 3018 jmpb(POP_LABEL); 3019 }//if (VM_Version::supports_avx512vlbw()) 3020 #endif // _LP64 3021 3022 // Discard the stored length difference 3023 bind(POP_LABEL); 3024 pop(cnt1); 3025 3026 // That's it 3027 bind(DONE_LABEL); 3028 if(ae == StrIntrinsicNode::UL) { 3029 negl(result); 3030 } 3031 3032 } 3033 3034 // Search for Non-ASCII character (Negative byte value) in a byte array, 3035 // return true if it has any and false otherwise. 3036 // ..\jdk\src\java.base\share\classes\java\lang\StringCoding.java 3037 // @HotSpotIntrinsicCandidate 3038 // private static boolean hasNegatives(byte[] ba, int off, int len) { 3039 // for (int i = off; i < off + len; i++) { 3040 // if (ba[i] < 0) { 3041 // return true; 3042 // } 3043 // } 3044 // return false; 3045 // } 3046 void C2_MacroAssembler::has_negatives(Register ary1, Register len, 3047 Register result, Register tmp1, 3048 XMMRegister vec1, XMMRegister vec2) { 3049 // rsi: byte array 3050 // rcx: len 3051 // rax: result 3052 ShortBranchVerifier sbv(this); 3053 assert_different_registers(ary1, len, result, tmp1); 3054 assert_different_registers(vec1, vec2); 3055 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_CHAR, COMPARE_VECTORS, COMPARE_BYTE; 3056 3057 // len == 0 3058 testl(len, len); 3059 jcc(Assembler::zero, FALSE_LABEL); 3060 3061 if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512 3062 VM_Version::supports_avx512vlbw() && 3063 VM_Version::supports_bmi2()) { 3064 3065 Label test_64_loop, test_tail; 3066 Register tmp3_aliased = len; 3067 3068 movl(tmp1, len); 3069 vpxor(vec2, vec2, vec2, Assembler::AVX_512bit); 3070 3071 andl(tmp1, 64 - 1); // tail count (in chars) 0x3F 3072 andl(len, ~(64 - 1)); // vector count (in chars) 3073 jccb(Assembler::zero, test_tail); 3074 3075 lea(ary1, Address(ary1, len, Address::times_1)); 3076 negptr(len); 3077 3078 bind(test_64_loop); 3079 // Check whether our 64 elements of size byte contain negatives 3080 evpcmpgtb(k2, vec2, Address(ary1, len, Address::times_1), Assembler::AVX_512bit); 3081 kortestql(k2, k2); 3082 jcc(Assembler::notZero, TRUE_LABEL); 3083 3084 addptr(len, 64); 3085 jccb(Assembler::notZero, test_64_loop); 3086 3087 3088 bind(test_tail); 3089 // bail out when there is nothing to be done 3090 testl(tmp1, -1); 3091 jcc(Assembler::zero, FALSE_LABEL); 3092 3093 // ~(~0 << len) applied up to two times (for 32-bit scenario) 3094 #ifdef _LP64 3095 mov64(tmp3_aliased, 0xFFFFFFFFFFFFFFFF); 3096 shlxq(tmp3_aliased, tmp3_aliased, tmp1); 3097 notq(tmp3_aliased); 3098 kmovql(k3, tmp3_aliased); 3099 #else 3100 Label 
k_init; 3101 jmp(k_init); 3102 3103 // We could not read 64-bits from a general purpose register thus we move 3104 // data required to compose 64 1's to the instruction stream 3105 // We emit 64 byte wide series of elements from 0..63 which later on would 3106 // be used as a compare targets with tail count contained in tmp1 register. 3107 // Result would be a k register having tmp1 consecutive number or 1 3108 // counting from least significant bit. 3109 address tmp = pc(); 3110 emit_int64(0x0706050403020100); 3111 emit_int64(0x0F0E0D0C0B0A0908); 3112 emit_int64(0x1716151413121110); 3113 emit_int64(0x1F1E1D1C1B1A1918); 3114 emit_int64(0x2726252423222120); 3115 emit_int64(0x2F2E2D2C2B2A2928); 3116 emit_int64(0x3736353433323130); 3117 emit_int64(0x3F3E3D3C3B3A3938); 3118 3119 bind(k_init); 3120 lea(len, InternalAddress(tmp)); 3121 // create mask to test for negative byte inside a vector 3122 evpbroadcastb(vec1, tmp1, Assembler::AVX_512bit); 3123 evpcmpgtb(k3, vec1, Address(len, 0), Assembler::AVX_512bit); 3124 3125 #endif 3126 evpcmpgtb(k2, k3, vec2, Address(ary1, 0), Assembler::AVX_512bit); 3127 ktestq(k2, k3); 3128 jcc(Assembler::notZero, TRUE_LABEL); 3129 3130 jmp(FALSE_LABEL); 3131 } else { 3132 movl(result, len); // copy 3133 3134 if (UseAVX >= 2 && UseSSE >= 2) { 3135 // With AVX2, use 32-byte vector compare 3136 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3137 3138 // Compare 32-byte vectors 3139 andl(result, 0x0000001f); // tail count (in bytes) 3140 andl(len, 0xffffffe0); // vector count (in bytes) 3141 jccb(Assembler::zero, COMPARE_TAIL); 3142 3143 lea(ary1, Address(ary1, len, Address::times_1)); 3144 negptr(len); 3145 3146 movl(tmp1, 0x80808080); // create mask to test for Unicode chars in vector 3147 movdl(vec2, tmp1); 3148 vpbroadcastd(vec2, vec2, Assembler::AVX_256bit); 3149 3150 bind(COMPARE_WIDE_VECTORS); 3151 vmovdqu(vec1, Address(ary1, len, Address::times_1)); 3152 vptest(vec1, vec2); 3153 jccb(Assembler::notZero, TRUE_LABEL); 3154 addptr(len, 32); 3155 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3156 3157 testl(result, result); 3158 jccb(Assembler::zero, FALSE_LABEL); 3159 3160 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3161 vptest(vec1, vec2); 3162 jccb(Assembler::notZero, TRUE_LABEL); 3163 jmpb(FALSE_LABEL); 3164 3165 bind(COMPARE_TAIL); // len is zero 3166 movl(len, result); 3167 // Fallthru to tail compare 3168 } else if (UseSSE42Intrinsics) { 3169 // With SSE4.2, use double quad vector compare 3170 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3171 3172 // Compare 16-byte vectors 3173 andl(result, 0x0000000f); // tail count (in bytes) 3174 andl(len, 0xfffffff0); // vector count (in bytes) 3175 jcc(Assembler::zero, COMPARE_TAIL); 3176 3177 lea(ary1, Address(ary1, len, Address::times_1)); 3178 negptr(len); 3179 3180 movl(tmp1, 0x80808080); 3181 movdl(vec2, tmp1); 3182 pshufd(vec2, vec2, 0); 3183 3184 bind(COMPARE_WIDE_VECTORS); 3185 movdqu(vec1, Address(ary1, len, Address::times_1)); 3186 ptest(vec1, vec2); 3187 jcc(Assembler::notZero, TRUE_LABEL); 3188 addptr(len, 16); 3189 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3190 3191 testl(result, result); 3192 jcc(Assembler::zero, FALSE_LABEL); 3193 3194 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3195 ptest(vec1, vec2); 3196 jccb(Assembler::notZero, TRUE_LABEL); 3197 jmpb(FALSE_LABEL); 3198 3199 bind(COMPARE_TAIL); // len is zero 3200 movl(len, result); 3201 // Fallthru to tail compare 3202 } 3203 } 3204 // Compare 4-byte vectors 3205 andl(len, 0xfffffffc); // vector count (in bytes) 3206 
jccb(Assembler::zero, COMPARE_CHAR); 3207 3208 lea(ary1, Address(ary1, len, Address::times_1)); 3209 negptr(len); 3210 3211 bind(COMPARE_VECTORS); 3212 movl(tmp1, Address(ary1, len, Address::times_1)); 3213 andl(tmp1, 0x80808080); 3214 jccb(Assembler::notZero, TRUE_LABEL); 3215 addptr(len, 4); 3216 jcc(Assembler::notZero, COMPARE_VECTORS); 3217 3218 // Compare trailing char (final 2 bytes), if any 3219 bind(COMPARE_CHAR); 3220 testl(result, 0x2); // tail char 3221 jccb(Assembler::zero, COMPARE_BYTE); 3222 load_unsigned_short(tmp1, Address(ary1, 0)); 3223 andl(tmp1, 0x00008080); 3224 jccb(Assembler::notZero, TRUE_LABEL); 3225 subptr(result, 2); 3226 lea(ary1, Address(ary1, 2)); 3227 3228 bind(COMPARE_BYTE); 3229 testl(result, 0x1); // tail byte 3230 jccb(Assembler::zero, FALSE_LABEL); 3231 load_unsigned_byte(tmp1, Address(ary1, 0)); 3232 andl(tmp1, 0x00000080); 3233 jccb(Assembler::notEqual, TRUE_LABEL); 3234 jmpb(FALSE_LABEL); 3235 3236 bind(TRUE_LABEL); 3237 movl(result, 1); // return true 3238 jmpb(DONE); 3239 3240 bind(FALSE_LABEL); 3241 xorl(result, result); // return false 3242 3243 // That's it 3244 bind(DONE); 3245 if (UseAVX >= 2 && UseSSE >= 2) { 3246 // clean upper bits of YMM registers 3247 vpxor(vec1, vec1); 3248 vpxor(vec2, vec2); 3249 } 3250 } 3251 // Compare char[] or byte[] arrays aligned to 4 bytes or substrings. 3252 void C2_MacroAssembler::arrays_equals(bool is_array_equ, Register ary1, Register ary2, 3253 Register limit, Register result, Register chr, 3254 XMMRegister vec1, XMMRegister vec2, bool is_char) { 3255 ShortBranchVerifier sbv(this); 3256 Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR, COMPARE_BYTE; 3257 3258 int length_offset = arrayOopDesc::length_offset_in_bytes(); 3259 int base_offset = arrayOopDesc::base_offset_in_bytes(is_char ? T_CHAR : T_BYTE); 3260 3261 if (is_array_equ) { 3262 // Check the input args 3263 cmpoop(ary1, ary2); 3264 jcc(Assembler::equal, TRUE_LABEL); 3265 3266 // Need additional checks for arrays_equals. 3267 testptr(ary1, ary1); 3268 jcc(Assembler::zero, FALSE_LABEL); 3269 testptr(ary2, ary2); 3270 jcc(Assembler::zero, FALSE_LABEL); 3271 3272 // Check the lengths 3273 movl(limit, Address(ary1, length_offset)); 3274 cmpl(limit, Address(ary2, length_offset)); 3275 jcc(Assembler::notEqual, FALSE_LABEL); 3276 } 3277 3278 // count == 0 3279 testl(limit, limit); 3280 jcc(Assembler::zero, TRUE_LABEL); 3281 3282 if (is_array_equ) { 3283 // Load array address 3284 lea(ary1, Address(ary1, base_offset)); 3285 lea(ary2, Address(ary2, base_offset)); 3286 } 3287 3288 if (is_array_equ && is_char) { 3289 // arrays_equals when used for char[]. 
3290 shll(limit, 1); // byte count != 0 3291 } 3292 movl(result, limit); // copy 3293 3294 if (UseAVX >= 2) { 3295 // With AVX2, use 32-byte vector compare 3296 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3297 3298 // Compare 32-byte vectors 3299 andl(result, 0x0000001f); // tail count (in bytes) 3300 andl(limit, 0xffffffe0); // vector count (in bytes) 3301 jcc(Assembler::zero, COMPARE_TAIL); 3302 3303 lea(ary1, Address(ary1, limit, Address::times_1)); 3304 lea(ary2, Address(ary2, limit, Address::times_1)); 3305 negptr(limit); 3306 3307 #ifdef _LP64 3308 if ((AVX3Threshold == 0) && VM_Version::supports_avx512vlbw()) { // trying 64 bytes fast loop 3309 Label COMPARE_WIDE_VECTORS_LOOP_AVX2, COMPARE_WIDE_VECTORS_LOOP_AVX3; 3310 3311 cmpl(limit, -64); 3312 jcc(Assembler::greater, COMPARE_WIDE_VECTORS_LOOP_AVX2); 3313 3314 bind(COMPARE_WIDE_VECTORS_LOOP_AVX3); // the hottest loop 3315 3316 evmovdquq(vec1, Address(ary1, limit, Address::times_1), Assembler::AVX_512bit); 3317 evpcmpeqb(k7, vec1, Address(ary2, limit, Address::times_1), Assembler::AVX_512bit); 3318 kortestql(k7, k7); 3319 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3320 addptr(limit, 64); // update since we already compared at this addr 3321 cmpl(limit, -64); 3322 jccb(Assembler::lessEqual, COMPARE_WIDE_VECTORS_LOOP_AVX3); 3323 3324 // At this point we may still need to compare -limit+result bytes. 3325 // We could execute the next two instruction and just continue via non-wide path: 3326 // cmpl(limit, 0); 3327 // jcc(Assembler::equal, COMPARE_TAIL); // true 3328 // But since we stopped at the points ary{1,2}+limit which are 3329 // not farther than 64 bytes from the ends of arrays ary{1,2}+result 3330 // (|limit| <= 32 and result < 32), 3331 // we may just compare the last 64 bytes. 
3332 // 3333 addptr(result, -64); // it is safe, bc we just came from this area 3334 evmovdquq(vec1, Address(ary1, result, Address::times_1), Assembler::AVX_512bit); 3335 evpcmpeqb(k7, vec1, Address(ary2, result, Address::times_1), Assembler::AVX_512bit); 3336 kortestql(k7, k7); 3337 jcc(Assembler::aboveEqual, FALSE_LABEL); // miscompare 3338 3339 jmp(TRUE_LABEL); 3340 3341 bind(COMPARE_WIDE_VECTORS_LOOP_AVX2); 3342 3343 }//if (VM_Version::supports_avx512vlbw()) 3344 #endif //_LP64 3345 bind(COMPARE_WIDE_VECTORS); 3346 vmovdqu(vec1, Address(ary1, limit, Address::times_1)); 3347 vmovdqu(vec2, Address(ary2, limit, Address::times_1)); 3348 vpxor(vec1, vec2); 3349 3350 vptest(vec1, vec1); 3351 jcc(Assembler::notZero, FALSE_LABEL); 3352 addptr(limit, 32); 3353 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3354 3355 testl(result, result); 3356 jcc(Assembler::zero, TRUE_LABEL); 3357 3358 vmovdqu(vec1, Address(ary1, result, Address::times_1, -32)); 3359 vmovdqu(vec2, Address(ary2, result, Address::times_1, -32)); 3360 vpxor(vec1, vec2); 3361 3362 vptest(vec1, vec1); 3363 jccb(Assembler::notZero, FALSE_LABEL); 3364 jmpb(TRUE_LABEL); 3365 3366 bind(COMPARE_TAIL); // limit is zero 3367 movl(limit, result); 3368 // Fallthru to tail compare 3369 } else if (UseSSE42Intrinsics) { 3370 // With SSE4.2, use double quad vector compare 3371 Label COMPARE_WIDE_VECTORS, COMPARE_TAIL; 3372 3373 // Compare 16-byte vectors 3374 andl(result, 0x0000000f); // tail count (in bytes) 3375 andl(limit, 0xfffffff0); // vector count (in bytes) 3376 jcc(Assembler::zero, COMPARE_TAIL); 3377 3378 lea(ary1, Address(ary1, limit, Address::times_1)); 3379 lea(ary2, Address(ary2, limit, Address::times_1)); 3380 negptr(limit); 3381 3382 bind(COMPARE_WIDE_VECTORS); 3383 movdqu(vec1, Address(ary1, limit, Address::times_1)); 3384 movdqu(vec2, Address(ary2, limit, Address::times_1)); 3385 pxor(vec1, vec2); 3386 3387 ptest(vec1, vec1); 3388 jcc(Assembler::notZero, FALSE_LABEL); 3389 addptr(limit, 16); 3390 jcc(Assembler::notZero, COMPARE_WIDE_VECTORS); 3391 3392 testl(result, result); 3393 jcc(Assembler::zero, TRUE_LABEL); 3394 3395 movdqu(vec1, Address(ary1, result, Address::times_1, -16)); 3396 movdqu(vec2, Address(ary2, result, Address::times_1, -16)); 3397 pxor(vec1, vec2); 3398 3399 ptest(vec1, vec1); 3400 jccb(Assembler::notZero, FALSE_LABEL); 3401 jmpb(TRUE_LABEL); 3402 3403 bind(COMPARE_TAIL); // limit is zero 3404 movl(limit, result); 3405 // Fallthru to tail compare 3406 } 3407 3408 // Compare 4-byte vectors 3409 andl(limit, 0xfffffffc); // vector count (in bytes) 3410 jccb(Assembler::zero, COMPARE_CHAR); 3411 3412 lea(ary1, Address(ary1, limit, Address::times_1)); 3413 lea(ary2, Address(ary2, limit, Address::times_1)); 3414 negptr(limit); 3415 3416 bind(COMPARE_VECTORS); 3417 movl(chr, Address(ary1, limit, Address::times_1)); 3418 cmpl(chr, Address(ary2, limit, Address::times_1)); 3419 jccb(Assembler::notEqual, FALSE_LABEL); 3420 addptr(limit, 4); 3421 jcc(Assembler::notZero, COMPARE_VECTORS); 3422 3423 // Compare trailing char (final 2 bytes), if any 3424 bind(COMPARE_CHAR); 3425 testl(result, 0x2); // tail char 3426 jccb(Assembler::zero, COMPARE_BYTE); 3427 load_unsigned_short(chr, Address(ary1, 0)); 3428 load_unsigned_short(limit, Address(ary2, 0)); 3429 cmpl(chr, limit); 3430 jccb(Assembler::notEqual, FALSE_LABEL); 3431 3432 if (is_array_equ && is_char) { 3433 bind(COMPARE_BYTE); 3434 } else { 3435 lea(ary1, Address(ary1, 2)); 3436 lea(ary2, Address(ary2, 2)); 3437 3438 bind(COMPARE_BYTE); 3439 testl(result, 0x1); // tail 
byte 3440 jccb(Assembler::zero, TRUE_LABEL); 3441 load_unsigned_byte(chr, Address(ary1, 0)); 3442 load_unsigned_byte(limit, Address(ary2, 0)); 3443 cmpl(chr, limit); 3444 jccb(Assembler::notEqual, FALSE_LABEL); 3445 } 3446 bind(TRUE_LABEL); 3447 movl(result, 1); // return true 3448 jmpb(DONE); 3449 3450 bind(FALSE_LABEL); 3451 xorl(result, result); // return false 3452 3453 // That's it 3454 bind(DONE); 3455 if (UseAVX >= 2) { 3456 // clean upper bits of YMM registers 3457 vpxor(vec1, vec1); 3458 vpxor(vec2, vec2); 3459 } 3460 }