/*
 * Copyright (c) 1997, 2020, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "oops/accessDecorators.hpp"
#include "oops/compressedOops.inline.hpp"
#include "oops/klass.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/flags/flagSetting.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/safepoint.hpp"
#include "runtime/safepointMechanism.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.hpp"
#include "utilities/macros.hpp"
#include "crc32c.h"

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

static Assembler::Condition reverse[] = {
    Assembler::noOverflow   /* overflow     = 0x0 */ ,
    Assembler::overflow     /* noOverflow   = 0x1 */ ,
    Assembler::aboveEqual   /* carrySet     = 0x2, below      = 0x2 */ ,
    Assembler::below        /* aboveEqual   = 0x3, carryClear = 0x3 */ ,
    Assembler::notZero      /* zero         = 0x4, equal      = 0x4 */ ,
    Assembler::zero         /* notZero      = 0x5, notEqual   = 0x5 */ ,
    Assembler::above        /* belowEqual   = 0x6 */ ,
    Assembler::belowEqual   /* above        = 0x7 */ ,
    Assembler::positive     /* negative     = 0x8 */ ,
    Assembler::negative     /* positive     = 0x9 */ ,
    Assembler::noParity     /* parity       = 0xa */ ,
    Assembler::parity       /* noParity     = 0xb */ ,
    Assembler::greaterEqual /* less         = 0xc */ ,
    Assembler::less         /* greaterEqual = 0xd */ ,
    Assembler::greater      /* lessEqual    = 0xe */ ,
    Assembler::lessEqual    /* greater      = 0xf */
};


// Implementation of MacroAssembler

// First all the versions that have distinct versions depending on 32/64 bit
// Unless the difference is trivial (1 line or so).
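// Note on the reverse[] table above: indexing it with a condition code yields
// that condition's logical negation, e.g. reverse[Assembler::zero /* 0x4 */]
// is Assembler::notZero. Compare negate_condition(), used by cond_inc32()
// further below.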
#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop_raw(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop_raw(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Address src1, jobject obj) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, src1, obj);
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->obj_equals(this, src1, obj);
}

void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}

void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
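  // The result is delivered in x_hi: -1 if x < y, 0 if x == y, +1 if x > y
  // (the convention of the lcmp bytecode). The high words decide first; only
  // when they are equal are the low words compared, unsigned.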
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t)adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [  ??  ] \               \
  //            ....    | y_rsp_offset |
  //          [ y_lo ] /  (in bytes)   | x_rsp_offset
  //          [ y_hi ]                 | (in bytes)
  //            ....                   |
  //          [ x_lo ]                /
  //          [ x_hi ]
  //            ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx
  // 3rd step
  bind(quick);                                   // note: rbx = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}

void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
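  // Example: for s = 40 the branch above is not taken, hi:lo becomes lo:0,
  // and the shldl/shll below then shift by s mod 32 = 8, completing the
  // 40-bit shift.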
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}


void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) {
  // In order to get locks to work, we need to fake an in_VM state
  JavaThread* thread = JavaThread::current();
  JavaThreadState saved_state = thread->thread_state();
  thread->set_thread_state(_thread_in_vm);
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("eip = 0x%08x", eip);
#ifndef PRODUCT
  if ((WizardMode || Verbose) && PrintMiscellaneous) {
    tty->cr();
    findpc(eip);
    tty->cr();
  }
#endif
#define PRINT_REG(rax) \
  { tty->print("%s = ", #rax); os::print_location(tty, rax); }
  PRINT_REG(rax);
  PRINT_REG(rbx);
  PRINT_REG(rcx);
  PRINT_REG(rdx);
  PRINT_REG(rdi);
  PRINT_REG(rsi);
  PRINT_REG(rbp);
  PRINT_REG(rsp);
#undef PRINT_REG
  // Print some words near top of stack.
  int* dump_sp = (int*) rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 16; row++) {
    tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 8; col++) {
      tty->print(" 0x%08x", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)eip-64, (address)eip);
  tty->print_cr("--------");
  Disassembler::decode((address)eip, (address)eip+32);
}

void MacroAssembler::stop(const char* msg) {
  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push_CPU_state();

  ExternalAddress message((address)msg);
  // push address of message
  pushptr(message.addr());

  call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning)));
  addl(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

void MacroAssembler::print_state() {
  { Label L; call(L, relocInfo::none); bind(L); }     // push eip
  pusha();                                            // push registers

  push_CPU_state();
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32)));
  pop_CPU_state();

  popa();
  addl(rsp, wordSize);
}

#else // _LP64

// 64 bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  // amd64 always does this as a pc-rel
  // we can be absolute or disp based on the instruction type
  // jmp/call are displacements others are absolute
  assert(!adr.is_lval(), "must be rval");
  assert(reachable(adr), "must be");
  return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  AddressLiteral base = adr.base();
  lea(rscratch1, base);
  Address index = adr.index();
  assert(index._disp == 0, "must not have disp"); // maybe it can?
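  // Note: rscratch1 now holds the array base and is baked into the returned
  // Address, so it must remain unclobbered until the memory access that uses
  // the result has been emitted.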
  Address array(rscratch1, index._index, index._scale, index._disp);
  return array;
}

void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) {
  Label L, E;

#ifdef _WIN64
  // Windows always allocates space for its register args
  assert(num_args <= 4, "only register arguments supported");
  subq(rsp, frame::arg_reg_save_area_bytes);
#endif

  // Align stack if necessary
  testl(rsp, 15);
  jcc(Assembler::zero, L);

  subq(rsp, 8);
  {
    call(RuntimeAddress(entry_point));
  }
  addq(rsp, 8);
  jmp(E);

  bind(L);
  {
    call(RuntimeAddress(entry_point));
  }

  bind(E);

#ifdef _WIN64
  // restore stack pointer
  addq(rsp, frame::arg_reg_save_area_bytes);
#endif
}

void MacroAssembler::cmp64(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "should use cmpptr");

  if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
}

int MacroAssembler::corrected_idivq(Register reg) {
  // Full implementation of Java ldiv and lrem; checks for special
  // case as described in JVM spec., p.243 & p.271. The function
  // returns the (pc) offset of the idivl instruction - may be needed
  // for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax: dividend                         min_long
  //         reg: divisor (may not be eax/edx)     -1
  //
  // output: rax: quotient  (= rax idiv reg)       min_long
  //         rdx: remainder (= rax irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register");
  static const int64_t min_long = 0x8000000000000000;
  Label normal_case, special_case;

  // check for special case
  cmp64(rax, ExternalAddress((address) &min_long));
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where
                  // remainder = 0)
  cmpq(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdqq();
  int idivq_offset = offset();
  idivq(reg);

  // normal and special case exit
  bind(special_case);

  return idivq_offset;
}

void MacroAssembler::decrementq(Register reg, int value) {
  if (value == min_jint) { subq(reg, value); return; }
  if (value <  0) { incrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(reg) ; return; }
  /* else */      { subq(reg, value)       ; return; }
}

void MacroAssembler::decrementq(Address dst, int value) {
  if (value == min_jint) { subq(dst, value); return; }
  if (value <  0) { incrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { decq(dst) ; return; }
  /* else */      { subq(dst, value)       ; return; }
}

void MacroAssembler::incrementq(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementq(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementq(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementq(Register reg, int value) {
  if (value == min_jint) { addq(reg, value); return; }
  if (value <  0) { decrementq(reg, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(reg) ; return; }
  /* else */      { addq(reg, value)       ; return; }
}

void MacroAssembler::incrementq(Address dst, int value) {
  if (value == min_jint) { addq(dst, value); return; }
  if (value <  0) { decrementq(dst, -value); return; }
  if (value == 0) {                        ; return; }
  if (value == 1 && UseIncDec) { incq(dst) ; return; }
  /* else */      { addq(dst, value)       ; return; }
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  lea(rscratch1, entry.base());
  Address dispatch = entry.index();
  assert(dispatch._base == noreg, "must be");
  dispatch._base = rscratch1;
  jmp(dispatch);
}

void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  cmpq(x_lo, y_lo);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal64(dst, (intptr_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec());
  movptr(dst, rscratch1);
}

void MacroAssembler::leave() {
  // %%% is this really better? Why not on 32bit too?
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  // Always clear the pc because it could have been set by make_walkable()
  movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  vzeroupper();
}

void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  vzeroupper();
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg) {
    masm->mov(c_rarg3, arg);
  }
}

void MacroAssembler::stop(const char* msg) {
  if (ShowMessageBoxOnError) {
    address rip = pc();
    pusha(); // get regs on stack
    lea(c_rarg1, InternalAddress(rip));
    movq(c_rarg2, rsp); // pass pointer to regs array
  }
  lea(c_rarg0, ExternalAddress((address) msg));
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(rax, ExternalAddress(CAST_FROM_FN_PTR(address, warning)));
  call(rax);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake an in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
    }
  }
  fatal("DEBUG MESSAGE: %s", msg);
}

void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", (intptr_t)pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  // rsp is actually not stored by pusha(), compute the old rsp from regs (rsp after pusha): regs + 16 = old rsp
  PRINT_REG(rsp, (intptr_t)(&regs[16]));
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near the top of the stack.
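  // regs[] is the register block pushed by pusha() in stop()/print_state()
  // (r15 at regs[0] through rax at regs[15], per the indices above), so the
  // pre-pusha stack pointer is the address just past the block: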
  int64_t* rsp = &regs[16];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", (intptr_t)*dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addpd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::align(int modulus) {
  align(modulus, offset());
}

void MacroAssembler::align(int modulus, int target) {
  if (target % modulus != 0) {
    nop(modulus - (target % modulus));
  }
}

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::andpd(dst, Address(scratch_reg, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src, Register scratch_reg) {
  // Used in sign-masking with aligned address.
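  // (Typical use: clearing the sign bit for abs via a constant bit mask.
  // Legacy SSE requires the memory operand of andps/andpd to be 16-byte
  // aligned, hence the assert below; the AVX encodings lift that restriction,
  // which is why UseAVX > 0 bypasses the check.)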
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(scratch_reg, src);
    Assembler::andps(dst, Address(scratch_reg, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

void MacroAssembler::atomic_incl(Address counter_addr) {
  lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incl(Address(scr, 0));
  }
}

#ifdef _LP64
void MacroAssembler::atomic_incq(Address counter_addr) {
  lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incq(Address(scr, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down including all pages in the shadow zone.
  for (int i = 1; i < ((int)JavaThread::stack_shadow_zone_size() / os::vm_page_size()); i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}

void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;
  Register thread = NOT_LP64(rsi) LP64_ONLY(r15_thread);
  NOT_LP64(get_thread(rsi);)

  cmpptr(rsp, Address(thread, JavaThread::reserved_stack_activation_offset()));
  jcc(Assembler::below, no_reserved_zone_enabling);

  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), thread);
  jump(RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         Register tmp_reg2,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markWord::age_shift == markWord::lock_bits + markWord::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  NOT_LP64( Address saved_mark_addr(lock_reg, 0); )

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
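  // (Recall the biased-lock mark word layout these masks rely on, low bits
  // first: 3 bits of lock/biased_lock pattern (0b101 when biased), then the
  // age field directly above it (as the age_shift assert above checks), then
  // the epoch bits, with the owning JavaThread* in the upper bits.)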
  movptr(saved_mark_addr, swap_reg);
#endif
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markWord::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  testptr(header_reg, markWord::biased_lock_mask_in_place);
  jccb(Assembler::notZero, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  testptr(header_reg, markWord::epoch_mask_in_place);
  jccb(Assembler::notZero, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  andptr(swap_reg,
         markWord::biased_lock_mask_in_place | markWord::age_mask_in_place | markWord::epoch_mask_in_place);
#ifdef _LP64
  movptr(tmp_reg, swap_reg);
  orptr(tmp_reg, r15_thread);
#else
  get_thread(tmp_reg);
  orptr(tmp_reg, swap_reg);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, this means that
  // another thread succeeded in biasing it toward itself and we
  // need to revoke that bias. The revocation will occur in the
  // interpreter runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid.
  // Under these circumstances _only_, we are allowed to use the current
  // header's value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
#else
  get_thread(swap_reg);
  orptr(tmp_reg, swap_reg);
  movptr(swap_reg, saved_mark_addr);
#endif
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // If the biasing toward our thread failed, then another thread
  // succeeded in biasing it toward itself and we need to revoke that
  // bias. The revocation will occur in the runtime in the slow case.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->rebiased_lock_entry_count_addr()));
  }
  if (slow_case != NULL) {
    jcc(Assembler::notZero, *slow_case);
  }
  jmp(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  NOT_LP64( movptr(swap_reg, saved_mark_addr); )
  load_prototype_header(tmp_reg, obj_reg, tmp_reg2);
  lock();
  cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg
  // Fall through to the normal CAS-based lock, because no matter what
  // the result of the above CAS, some thread must have succeeded in
  // removing the bias bit from the object's header.
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->revoked_lock_entry_count_addr()));
  }

  bind(cas_label);

  return null_check_offset;
}

void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andptr(temp_reg, markWord::biased_lock_mask_in_place);
  cmpptr(temp_reg, markWord::biased_lock_pattern);
  jcc(Assembler::equal, done);
}

void MacroAssembler::c2bool(Register x) {
  // implements x == 0 ? 0 : 1
  // note: must only look at least-significant byte of x
  // since C-style booleans are stored in one byte
  // only! (was bug)
  andl(x, 0xFF);
  setb(Assembler::notZero, x);
}

// Wouldn't need if AddressLiteral version had new name
void MacroAssembler::call(Label& L, relocInfo::relocType rtype) {
  Assembler::call(L, rtype);
}

void MacroAssembler::call(Register entry) {
  Assembler::call(entry);
}

void MacroAssembler::call(AddressLiteral entry) {
  if (reachable(entry)) {
    Assembler::call_literal(entry.target(), entry.rspec());
  } else {
    lea(rscratch1, entry);
    Assembler::call(rscratch1);
  }
}

void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  movptr(rax, (intptr_t)Universe::non_oop_word());
  call(AddressLiteral(entry, rh));
}

// Implementation of call_VM versions

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));

  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  Label C, E;
  call(C, relocInfo::none);
  jmp(E);

  bind(C);

  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
  ret(0);

  bind(E);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   int number_of_arguments,
                                   bool check_exceptions) {
  Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg);
  MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   bool check_exceptions) {
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   bool check_exceptions) {

  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::super_call_VM(Register oop_result,
                                   Register last_java_sp,
                                   address entry_point,
                                   Register arg_1,
                                   Register arg_2,
                                   Register arg_3,
                                   bool check_exceptions) {
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}

void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address  entry_point,
                                  int      number_of_arguments,
                                  bool     check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
#ifdef _LP64
    java_thread = r15_thread;
#else
    java_thread = rdi;
    get_thread(java_thread);
#endif // LP64
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }
  // debugging support
  assert(number_of_arguments >= 0, "cannot have negative number of arguments");
  LP64_ONLY(assert(java_thread == r15_thread, "unexpected register"));
#ifdef ASSERT
  // TraceBytecodes does not use r12 but saves it over the call, so don't verify
  // r12 is the heapbase.
  LP64_ONLY(if (UseCompressedOops && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");)
#endif // ASSERT

  assert(java_thread != oop_result  , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)

  NOT_LP64(push(java_thread); number_of_arguments++);
  LP64_ONLY(mov(c_rarg0, r15_thread));

  // set last Java frame before call
  assert(last_java_sp != rbp, "can't use ebp/rbp");

  // Only interpreter should have to set fp
  set_last_Java_frame(java_thread, last_java_sp, rbp, NULL);

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments);

  // restore the thread (cannot use the pushed argument since arguments
  // may be overwritten by C code generated by an optimizing compiler);
  // however can use the register value directly if it is callee saved.
  if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) {
    // rdi & rsi (also r15) are callee saved -> nothing to do
#ifdef ASSERT
    guarantee(java_thread != rax, "change this code");
    push(rax);
    { Label L;
      get_thread(rax);
      cmpptr(java_thread, rax);
      jcc(Assembler::equal, L);
      STOP("MacroAssembler::call_VM_base: rdi not callee saved?");
      bind(L);
    }
    pop(rax);
#endif
  } else {
    get_thread(java_thread);
  }
  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(java_thread, true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD);
#ifndef _LP64
    jump_cc(Assembler::notEqual,
            RuntimeAddress(StubRoutines::forward_exception_entry()));
#else
    // This used to conditionally jump to forward_exception however it is
    // possible if we relocate that the branch will not reach. So we must jump
    // around so we can always reach

    Label ok;
    jcc(Assembler::equal, ok);
    jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
    bind(ok);
#endif // LP64
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {

  // Calculate the value for last_Java_sp somewhat subtly.
  // call_VM does an intermediate call which places a return address on
  // the stack just under the stack pointer as the user finished with it.
  // This allows us to retrieve last_Java_pc from last_Java_sp[-1].
  // On 32bit we then have to push additional args on the stack to accomplish
  // the actual requested call. On 64bit call_VM can only use register args,
  // so the only extra space is the return address that call_VM created.
  // This hopefully explains the calculations here.
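  // Sketch of the 64-bit case at this point:
  //
  //   rsp + 8 -> stack as the caller left it    <- last_Java_sp (rax below)
  //   rsp     -> return address pushed by call_VM's intermediate call
  //              (retrievable as last_Java_sp[-1])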
#ifdef _LP64
  // We've pushed one address, correct last_Java_sp
  lea(rax, Address(rsp, wordSize));
#else
  lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize));
#endif // LP64

  call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions);
}

// Use this method when MacroAssembler version of call_VM_leaf_base() should be called from Interpreter.
void MacroAssembler::call_VM_leaf0(address entry_point) {
  MacroAssembler::call_VM_leaf_base(entry_point, 0);
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  call_VM_leaf(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg"));
  LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg"));
  pass_arg3(this, arg_3);
  LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg"));
  LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg"));
  pass_arg2(this, arg_2);
  LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg"));
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD);
  verify_oop_msg(oop_result, "broken oop in call_VM_base");
}

void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 1688 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD); 1689 } 1690 1691 void MacroAssembler::check_and_handle_earlyret(Register java_thread) { 1692 } 1693 1694 void MacroAssembler::check_and_handle_popframe(Register java_thread) { 1695 } 1696 1697 void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) { 1698 if (reachable(src1)) { 1699 cmpl(as_Address(src1), imm); 1700 } else { 1701 lea(rscratch1, src1); 1702 cmpl(Address(rscratch1, 0), imm); 1703 } 1704 } 1705 1706 void MacroAssembler::cmp32(Register src1, AddressLiteral src2) { 1707 assert(!src2.is_lval(), "use cmpptr"); 1708 if (reachable(src2)) { 1709 cmpl(src1, as_Address(src2)); 1710 } else { 1711 lea(rscratch1, src2); 1712 cmpl(src1, Address(rscratch1, 0)); 1713 } 1714 } 1715 1716 void MacroAssembler::cmp32(Register src1, int32_t imm) { 1717 Assembler::cmpl(src1, imm); 1718 } 1719 1720 void MacroAssembler::cmp32(Register src1, Address src2) { 1721 Assembler::cmpl(src1, src2); 1722 } 1723 1724 void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1725 ucomisd(opr1, opr2); 1726 1727 Label L; 1728 if (unordered_is_less) { 1729 movl(dst, -1); 1730 jcc(Assembler::parity, L); 1731 jcc(Assembler::below , L); 1732 movl(dst, 0); 1733 jcc(Assembler::equal , L); 1734 increment(dst); 1735 } else { // unordered is greater 1736 movl(dst, 1); 1737 jcc(Assembler::parity, L); 1738 jcc(Assembler::above , L); 1739 movl(dst, 0); 1740 jcc(Assembler::equal , L); 1741 decrementl(dst); 1742 } 1743 bind(L); 1744 } 1745 1746 void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) { 1747 ucomiss(opr1, opr2); 1748 1749 Label L; 1750 if (unordered_is_less) { 1751 movl(dst, -1); 1752 jcc(Assembler::parity, L); 1753 jcc(Assembler::below , L); 1754 movl(dst, 0); 1755 jcc(Assembler::equal , L); 1756 increment(dst); 1757 } else { // unordered is greater 1758 movl(dst, 1); 1759 jcc(Assembler::parity, L); 1760 jcc(Assembler::above , L); 1761 movl(dst, 0); 1762 jcc(Assembler::equal , L); 1763 decrementl(dst); 1764 } 1765 bind(L); 1766 } 1767 1768 1769 void MacroAssembler::cmp8(AddressLiteral src1, int imm) { 1770 if (reachable(src1)) { 1771 cmpb(as_Address(src1), imm); 1772 } else { 1773 lea(rscratch1, src1); 1774 cmpb(Address(rscratch1, 0), imm); 1775 } 1776 } 1777 1778 void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) { 1779 #ifdef _LP64 1780 if (src2.is_lval()) { 1781 movptr(rscratch1, src2); 1782 Assembler::cmpq(src1, rscratch1); 1783 } else if (reachable(src2)) { 1784 cmpq(src1, as_Address(src2)); 1785 } else { 1786 lea(rscratch1, src2); 1787 Assembler::cmpq(src1, Address(rscratch1, 0)); 1788 } 1789 #else 1790 if (src2.is_lval()) { 1791 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec()); 1792 } else { 1793 cmpl(src1, as_Address(src2)); 1794 } 1795 #endif // _LP64 1796 } 1797 1798 void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) { 1799 assert(src2.is_lval(), "not a mem-mem compare"); 1800 #ifdef _LP64 1801 // moves src2's literal address 1802 movptr(rscratch1, src2); 1803 Assembler::cmpq(src1, rscratch1); 1804 #else 1805 cmp_literal32(src1, (int32_t) src2.target(), src2.rspec()); 1806 #endif // _LP64 1807 } 1808 1809 void MacroAssembler::cmpoop(Register src1, Register src2) { 1810 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1811 bs->obj_equals(this, src1, src2); 1812 } 1813 1814 void 
MacroAssembler::cmpoop(Register src1, Address src2) { 1815 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1816 bs->obj_equals(this, src1, src2); 1817 } 1818 1819 #ifdef _LP64 1820 void MacroAssembler::cmpoop(Register src1, jobject src2) { 1821 movoop(rscratch1, src2); 1822 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1823 bs->obj_equals(this, src1, rscratch1); 1824 } 1825 #endif 1826 1827 void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) { 1828 if (reachable(adr)) { 1829 lock(); 1830 cmpxchgptr(reg, as_Address(adr)); 1831 } else { 1832 lea(rscratch1, adr); 1833 lock(); 1834 cmpxchgptr(reg, Address(rscratch1, 0)); 1835 } 1836 } 1837 1838 void MacroAssembler::cmpxchgptr(Register reg, Address adr) { 1839 LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr)); 1840 } 1841 1842 void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) { 1843 if (reachable(src)) { 1844 Assembler::comisd(dst, as_Address(src)); 1845 } else { 1846 lea(rscratch1, src); 1847 Assembler::comisd(dst, Address(rscratch1, 0)); 1848 } 1849 } 1850 1851 void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) { 1852 if (reachable(src)) { 1853 Assembler::comiss(dst, as_Address(src)); 1854 } else { 1855 lea(rscratch1, src); 1856 Assembler::comiss(dst, Address(rscratch1, 0)); 1857 } 1858 } 1859 1860 1861 void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) { 1862 Condition negated_cond = negate_condition(cond); 1863 Label L; 1864 jcc(negated_cond, L); 1865 pushf(); // Preserve flags 1866 atomic_incl(counter_addr); 1867 popf(); 1868 bind(L); 1869 } 1870 1871 int MacroAssembler::corrected_idivl(Register reg) { 1872 // Full implementation of Java idiv and irem; checks for 1873 // special case as described in JVM spec., p.243 & p.271. 1874 // The function returns the (pc) offset of the idivl 1875 // instruction - may be needed for implicit exceptions. 
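// Why the special case exists: x86 idivl raises a #DE fault whenever the
// quotient does not fit in 32 bits, and min_int / -1 would produce +2^31,
// which is unrepresentable. The JVM spec instead defines the result as
// min_int with remainder 0, so that pair is produced below without ever
// executing idivl.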
1876 // 1877 // normal case special case 1878 // 1879 // input : rax,: dividend min_int 1880 // reg: divisor (may not be rax,/rdx) -1 1881 // 1882 // output: rax,: quotient (= rax, idiv reg) min_int 1883 // rdx: remainder (= rax, irem reg) 0 1884 assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register"); 1885 const int min_int = 0x80000000; 1886 Label normal_case, special_case; 1887 1888 // check for special case 1889 cmpl(rax, min_int); 1890 jcc(Assembler::notEqual, normal_case); 1891 xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0) 1892 cmpl(reg, -1); 1893 jcc(Assembler::equal, special_case); 1894 1895 // handle normal case 1896 bind(normal_case); 1897 cdql(); 1898 int idivl_offset = offset(); 1899 idivl(reg); 1900 1901 // normal and special case exit 1902 bind(special_case); 1903 1904 return idivl_offset; 1905 } 1906 1907 1908 1909 void MacroAssembler::decrementl(Register reg, int value) { 1910 if (value == min_jint) {subl(reg, value) ; return; } 1911 if (value < 0) { incrementl(reg, -value); return; } 1912 if (value == 0) { ; return; } 1913 if (value == 1 && UseIncDec) { decl(reg) ; return; } 1914 /* else */ { subl(reg, value) ; return; } 1915 } 1916 1917 void MacroAssembler::decrementl(Address dst, int value) { 1918 if (value == min_jint) {subl(dst, value) ; return; } 1919 if (value < 0) { incrementl(dst, -value); return; } 1920 if (value == 0) { ; return; } 1921 if (value == 1 && UseIncDec) { decl(dst) ; return; } 1922 /* else */ { subl(dst, value) ; return; } 1923 } 1924 1925 void MacroAssembler::division_with_shift (Register reg, int shift_value) { 1926 assert (shift_value > 0, "illegal shift value"); 1927 Label _is_positive; 1928 testl (reg, reg); 1929 jcc (Assembler::positive, _is_positive); 1930 int offset = (1 << shift_value) - 1 ; 1931 1932 if (offset == 1) { 1933 incrementl(reg); 1934 } else { 1935 addl(reg, offset); 1936 } 1937 1938 bind (_is_positive); 1939 sarl(reg, shift_value); 1940 } 1941 1942 void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) { 1943 if (reachable(src)) { 1944 Assembler::divsd(dst, as_Address(src)); 1945 } else { 1946 lea(rscratch1, src); 1947 Assembler::divsd(dst, Address(rscratch1, 0)); 1948 } 1949 } 1950 1951 void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) { 1952 if (reachable(src)) { 1953 Assembler::divss(dst, as_Address(src)); 1954 } else { 1955 lea(rscratch1, src); 1956 Assembler::divss(dst, Address(rscratch1, 0)); 1957 } 1958 } 1959 1960 void MacroAssembler::enter() { 1961 push(rbp); 1962 mov(rbp, rsp); 1963 } 1964 1965 // A 5 byte nop that is safe for patching (see patch_verified_entry) 1966 void MacroAssembler::fat_nop() { 1967 if (UseAddressNop) { 1968 addr_nop_5(); 1969 } else { 1970 emit_int8(0x26); // es: 1971 emit_int8(0x2e); // cs: 1972 emit_int8(0x64); // fs: 1973 emit_int8(0x65); // gs: 1974 emit_int8((unsigned char)0x90); 1975 } 1976 } 1977 1978 #ifndef _LP64 1979 void MacroAssembler::fcmp(Register tmp) { 1980 fcmp(tmp, 1, true, true); 1981 } 1982 1983 void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) { 1984 assert(!pop_right || pop_left, "usage error"); 1985 if (VM_Version::supports_cmov()) { 1986 assert(tmp == noreg, "unneeded temp"); 1987 if (pop_left) { 1988 fucomip(index); 1989 } else { 1990 fucomi(index); 1991 } 1992 if (pop_right) { 1993 fpop(); 1994 } 1995 } else { 1996 assert(tmp != noreg, "need temp"); 1997 if (pop_left) { 1998 if (pop_right) { 1999 fcompp(); 2000 } else { 2001 fcomp(index); 2002 } 2003 } else { 2004 
fcom(index); 2005 } 2006 // convert FPU condition into eflags condition via rax, 2007 save_rax(tmp); 2008 fwait(); fnstsw_ax(); 2009 sahf(); 2010 restore_rax(tmp); 2011 } 2012 // condition codes set as follows: 2013 // 2014 // CF (corresponds to C0) if x < y 2015 // PF (corresponds to C2) if unordered 2016 // ZF (corresponds to C3) if x = y 2017 } 2018 2019 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) { 2020 fcmp2int(dst, unordered_is_less, 1, true, true); 2021 } 2022 2023 void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) { 2024 fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right); 2025 Label L; 2026 if (unordered_is_less) { 2027 movl(dst, -1); 2028 jcc(Assembler::parity, L); 2029 jcc(Assembler::below , L); 2030 movl(dst, 0); 2031 jcc(Assembler::equal , L); 2032 increment(dst); 2033 } else { // unordered is greater 2034 movl(dst, 1); 2035 jcc(Assembler::parity, L); 2036 jcc(Assembler::above , L); 2037 movl(dst, 0); 2038 jcc(Assembler::equal , L); 2039 decrementl(dst); 2040 } 2041 bind(L); 2042 } 2043 2044 void MacroAssembler::fld_d(AddressLiteral src) { 2045 fld_d(as_Address(src)); 2046 } 2047 2048 void MacroAssembler::fld_s(AddressLiteral src) { 2049 fld_s(as_Address(src)); 2050 } 2051 2052 void MacroAssembler::fld_x(AddressLiteral src) { 2053 Assembler::fld_x(as_Address(src)); 2054 } 2055 2056 void MacroAssembler::fldcw(AddressLiteral src) { 2057 Assembler::fldcw(as_Address(src)); 2058 } 2059 2060 void MacroAssembler::fpop() { 2061 ffree(); 2062 fincstp(); 2063 } 2064 2065 void MacroAssembler::fremr(Register tmp) { 2066 save_rax(tmp); 2067 { Label L; 2068 bind(L); 2069 fprem(); 2070 fwait(); fnstsw_ax(); 2071 sahf(); 2072 jcc(Assembler::parity, L); 2073 } 2074 restore_rax(tmp); 2075 // Result is in ST0. 
2076 // Note: fxch & fpop to get rid of ST1 2077 // (otherwise FPU stack could overflow eventually) 2078 fxch(1); 2079 fpop(); 2080 } 2081 2082 void MacroAssembler::empty_FPU_stack() { 2083 if (VM_Version::supports_mmx()) { 2084 emms(); 2085 } else { 2086 for (int i = 8; i-- > 0; ) ffree(i); 2087 } 2088 } 2089 #endif // !LP64 2090 2091 void MacroAssembler::mulpd(XMMRegister dst, AddressLiteral src) { 2092 if (reachable(src)) { 2093 Assembler::mulpd(dst, as_Address(src)); 2094 } else { 2095 lea(rscratch1, src); 2096 Assembler::mulpd(dst, Address(rscratch1, 0)); 2097 } 2098 } 2099 2100 void MacroAssembler::load_float(Address src) { 2101 #ifdef _LP64 2102 movflt(xmm0, src); 2103 #else 2104 if (UseSSE >= 1) { 2105 movflt(xmm0, src); 2106 } else { 2107 fld_s(src); 2108 } 2109 #endif // LP64 2110 } 2111 2112 void MacroAssembler::store_float(Address dst) { 2113 #ifdef _LP64 2114 movflt(dst, xmm0); 2115 #else 2116 if (UseSSE >= 1) { 2117 movflt(dst, xmm0); 2118 } else { 2119 fstp_s(dst); 2120 } 2121 #endif // LP64 2122 } 2123 2124 void MacroAssembler::load_double(Address src) { 2125 #ifdef _LP64 2126 movdbl(xmm0, src); 2127 #else 2128 if (UseSSE >= 2) { 2129 movdbl(xmm0, src); 2130 } else { 2131 fld_d(src); 2132 } 2133 #endif // LP64 2134 } 2135 2136 void MacroAssembler::store_double(Address dst) { 2137 #ifdef _LP64 2138 movdbl(dst, xmm0); 2139 #else 2140 if (UseSSE >= 2) { 2141 movdbl(dst, xmm0); 2142 } else { 2143 fstp_d(dst); 2144 } 2145 #endif // LP64 2146 } 2147 2148 // dst = c = a * b + c 2149 void MacroAssembler::fmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2150 Assembler::vfmadd231sd(c, a, b); 2151 if (dst != c) { 2152 movdbl(dst, c); 2153 } 2154 } 2155 2156 // dst = c = a * b + c 2157 void MacroAssembler::fmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c) { 2158 Assembler::vfmadd231ss(c, a, b); 2159 if (dst != c) { 2160 movflt(dst, c); 2161 } 2162 } 2163 2164 // dst = c = a * b + c 2165 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2166 Assembler::vfmadd231pd(c, a, b, vector_len); 2167 if (dst != c) { 2168 vmovdqu(dst, c); 2169 } 2170 } 2171 2172 // dst = c = a * b + c 2173 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, XMMRegister b, XMMRegister c, int vector_len) { 2174 Assembler::vfmadd231ps(c, a, b, vector_len); 2175 if (dst != c) { 2176 vmovdqu(dst, c); 2177 } 2178 } 2179 2180 // dst = c = a * b + c 2181 void MacroAssembler::vfmad(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2182 Assembler::vfmadd231pd(c, a, b, vector_len); 2183 if (dst != c) { 2184 vmovdqu(dst, c); 2185 } 2186 } 2187 2188 // dst = c = a * b + c 2189 void MacroAssembler::vfmaf(XMMRegister dst, XMMRegister a, Address b, XMMRegister c, int vector_len) { 2190 Assembler::vfmadd231ps(c, a, b, vector_len); 2191 if (dst != c) { 2192 vmovdqu(dst, c); 2193 } 2194 } 2195 2196 void MacroAssembler::incrementl(AddressLiteral dst) { 2197 if (reachable(dst)) { 2198 incrementl(as_Address(dst)); 2199 } else { 2200 lea(rscratch1, dst); 2201 incrementl(Address(rscratch1, 0)); 2202 } 2203 } 2204 2205 void MacroAssembler::incrementl(ArrayAddress dst) { 2206 incrementl(as_Address(dst)); 2207 } 2208 2209 void MacroAssembler::incrementl(Register reg, int value) { 2210 if (value == min_jint) {addl(reg, value) ; return; } 2211 if (value < 0) { decrementl(reg, -value); return; } 2212 if (value == 0) { ; return; } 2213 if (value == 1 && UseIncDec) { incl(reg) ; return; } 2214 /* else */ { addl(reg, 
value) ; return; } 2215 } 2216 2217 void MacroAssembler::incrementl(Address dst, int value) { 2218 if (value == min_jint) {addl(dst, value) ; return; } 2219 if (value < 0) { decrementl(dst, -value); return; } 2220 if (value == 0) { ; return; } 2221 if (value == 1 && UseIncDec) { incl(dst) ; return; } 2222 /* else */ { addl(dst, value) ; return; } 2223 } 2224 2225 void MacroAssembler::jump(AddressLiteral dst) { 2226 if (reachable(dst)) { 2227 jmp_literal(dst.target(), dst.rspec()); 2228 } else { 2229 lea(rscratch1, dst); 2230 jmp(rscratch1); 2231 } 2232 } 2233 2234 void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) { 2235 if (reachable(dst)) { 2236 InstructionMark im(this); 2237 relocate(dst.reloc()); 2238 const int short_size = 2; 2239 const int long_size = 6; 2240 int offs = (intptr_t)dst.target() - ((intptr_t)pc()); 2241 if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) { 2242 // 0111 tttn #8-bit disp 2243 emit_int8(0x70 | cc); 2244 emit_int8((offs - short_size) & 0xFF); 2245 } else { 2246 // 0000 1111 1000 tttn #32-bit disp 2247 emit_int8(0x0F); 2248 emit_int8((unsigned char)(0x80 | cc)); 2249 emit_int32(offs - long_size); 2250 } 2251 } else { 2252 #ifdef ASSERT 2253 warning("reversing conditional branch"); 2254 #endif /* ASSERT */ 2255 Label skip; 2256 jccb(reverse[cc], skip); 2257 lea(rscratch1, dst); 2258 Assembler::jmp(rscratch1); 2259 bind(skip); 2260 } 2261 } 2262 2263 void MacroAssembler::ldmxcsr(AddressLiteral src) { 2264 if (reachable(src)) { 2265 Assembler::ldmxcsr(as_Address(src)); 2266 } else { 2267 lea(rscratch1, src); 2268 Assembler::ldmxcsr(Address(rscratch1, 0)); 2269 } 2270 } 2271 2272 int MacroAssembler::load_signed_byte(Register dst, Address src) { 2273 int off; 2274 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2275 off = offset(); 2276 movsbl(dst, src); // movsxb 2277 } else { 2278 off = load_unsigned_byte(dst, src); 2279 shll(dst, 24); 2280 sarl(dst, 24); 2281 } 2282 return off; 2283 } 2284 2285 // Note: load_signed_short used to be called load_signed_word. 2286 // Although the 'w' in x86 opcodes refers to the term "word" in the assembler 2287 // manual, which means 16 bits, that usage is found nowhere in HotSpot code. 2288 // The term "word" in HotSpot means a 32- or 64-bit machine word. 2289 int MacroAssembler::load_signed_short(Register dst, Address src) { 2290 int off; 2291 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2292 // This is dubious to me, since it seems safe to do a signed 16 => 64 bit 2293 // version, but this is what 64bit has always done. This seems to imply 2294 // that users are only using 32 bits' worth. 2295 off = offset(); 2296 movswl(dst, src); // movsxw 2297 } else { 2298 off = load_unsigned_short(dst, src); 2299 shll(dst, 16); 2300 sarl(dst, 16); 2301 } 2302 return off; 2303 } 2304 2305 int MacroAssembler::load_unsigned_byte(Register dst, Address src) { 2306 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2307 // and "3.9 Partial Register Penalties", p. 22. 2308 int off; 2309 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { 2310 off = offset(); 2311 movzbl(dst, src); // movzxb 2312 } else { 2313 xorl(dst, dst); 2314 off = offset(); 2315 movb(dst, src); 2316 } 2317 return off; 2318 } 2319 2320 // Note: load_unsigned_short used to be called load_unsigned_word. 2321 int MacroAssembler::load_unsigned_short(Register dst, Address src) { 2322 // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16, 2323 // and "3.9 Partial Register Penalties", p. 22.
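// In short (per AP-526): on the original Pentium, movzx was slower than the
// pairable xorl + mov sequence, so the explicit-zero form is used there. On
// P6 and later (and always on 64bit) movzwl/movzbl is a single fast
// zero-extending load, while writing only part of dst would risk a
// partial-register stall. The src.uses(dst) case must take the movz path,
// since dst cannot be zeroed while it still feeds the address computation.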
2324 int off; 2325 if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) { 2326 off = offset(); 2327 movzwl(dst, src); // movzxw 2328 } else { 2329 xorl(dst, dst); 2330 off = offset(); 2331 movw(dst, src); 2332 } 2333 return off; 2334 } 2335 2336 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { 2337 switch (size_in_bytes) { 2338 #ifndef _LP64 2339 case 8: 2340 assert(dst2 != noreg, "second dest register required"); 2341 movl(dst, src); 2342 movl(dst2, src.plus_disp(BytesPerInt)); 2343 break; 2344 #else 2345 case 8: movq(dst, src); break; 2346 #endif 2347 case 4: movl(dst, src); break; 2348 case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; 2349 case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; 2350 default: ShouldNotReachHere(); 2351 } 2352 } 2353 2354 void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { 2355 switch (size_in_bytes) { 2356 #ifndef _LP64 2357 case 8: 2358 assert(src2 != noreg, "second source register required"); 2359 movl(dst, src); 2360 movl(dst.plus_disp(BytesPerInt), src2); 2361 break; 2362 #else 2363 case 8: movq(dst, src); break; 2364 #endif 2365 case 4: movl(dst, src); break; 2366 case 2: movw(dst, src); break; 2367 case 1: movb(dst, src); break; 2368 default: ShouldNotReachHere(); 2369 } 2370 } 2371 2372 void MacroAssembler::mov32(AddressLiteral dst, Register src) { 2373 if (reachable(dst)) { 2374 movl(as_Address(dst), src); 2375 } else { 2376 lea(rscratch1, dst); 2377 movl(Address(rscratch1, 0), src); 2378 } 2379 } 2380 2381 void MacroAssembler::mov32(Register dst, AddressLiteral src) { 2382 if (reachable(src)) { 2383 movl(dst, as_Address(src)); 2384 } else { 2385 lea(rscratch1, src); 2386 movl(dst, Address(rscratch1, 0)); 2387 } 2388 } 2389 2390 // C++ bool manipulation 2391 2392 void MacroAssembler::movbool(Register dst, Address src) { 2393 if(sizeof(bool) == 1) 2394 movb(dst, src); 2395 else if(sizeof(bool) == 2) 2396 movw(dst, src); 2397 else if(sizeof(bool) == 4) 2398 movl(dst, src); 2399 else 2400 // unsupported 2401 ShouldNotReachHere(); 2402 } 2403 2404 void MacroAssembler::movbool(Address dst, bool boolconst) { 2405 if(sizeof(bool) == 1) 2406 movb(dst, (int) boolconst); 2407 else if(sizeof(bool) == 2) 2408 movw(dst, (int) boolconst); 2409 else if(sizeof(bool) == 4) 2410 movl(dst, (int) boolconst); 2411 else 2412 // unsupported 2413 ShouldNotReachHere(); 2414 } 2415 2416 void MacroAssembler::movbool(Address dst, Register src) { 2417 if(sizeof(bool) == 1) 2418 movb(dst, src); 2419 else if(sizeof(bool) == 2) 2420 movw(dst, src); 2421 else if(sizeof(bool) == 4) 2422 movl(dst, src); 2423 else 2424 // unsupported 2425 ShouldNotReachHere(); 2426 } 2427 2428 void MacroAssembler::movbyte(ArrayAddress dst, int src) { 2429 movb(as_Address(dst), src); 2430 } 2431 2432 void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) { 2433 if (reachable(src)) { 2434 movdl(dst, as_Address(src)); 2435 } else { 2436 lea(rscratch1, src); 2437 movdl(dst, Address(rscratch1, 0)); 2438 } 2439 } 2440 2441 void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) { 2442 if (reachable(src)) { 2443 movq(dst, as_Address(src)); 2444 } else { 2445 lea(rscratch1, src); 2446 movq(dst, Address(rscratch1, 0)); 2447 } 2448 } 2449 2450 void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) { 2451 if (reachable(src)) { 2452 if (UseXmmLoadAndClearUpper) { 2453 movsd (dst, 
as_Address(src)); 2454 } else { 2455 movlpd(dst, as_Address(src)); 2456 } 2457 } else { 2458 lea(rscratch1, src); 2459 if (UseXmmLoadAndClearUpper) { 2460 movsd (dst, Address(rscratch1, 0)); 2461 } else { 2462 movlpd(dst, Address(rscratch1, 0)); 2463 } 2464 } 2465 } 2466 2467 void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) { 2468 if (reachable(src)) { 2469 movss(dst, as_Address(src)); 2470 } else { 2471 lea(rscratch1, src); 2472 movss(dst, Address(rscratch1, 0)); 2473 } 2474 } 2475 2476 void MacroAssembler::movptr(Register dst, Register src) { 2477 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2478 } 2479 2480 void MacroAssembler::movptr(Register dst, Address src) { 2481 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2482 } 2483 2484 // src should NEVER be a real pointer. Use AddressLiteral for true pointers 2485 void MacroAssembler::movptr(Register dst, intptr_t src) { 2486 LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src)); 2487 } 2488 2489 void MacroAssembler::movptr(Address dst, Register src) { 2490 LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src)); 2491 } 2492 2493 void MacroAssembler::movdqu(Address dst, XMMRegister src) { 2494 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2495 Assembler::movdqu(dst, src); 2496 } 2497 2498 void MacroAssembler::movdqu(XMMRegister dst, Address src) { 2499 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2500 Assembler::movdqu(dst, src); 2501 } 2502 2503 void MacroAssembler::movdqu(XMMRegister dst, XMMRegister src) { 2504 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2505 Assembler::movdqu(dst, src); 2506 } 2507 2508 void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src, Register scratchReg) { 2509 if (reachable(src)) { 2510 movdqu(dst, as_Address(src)); 2511 } else { 2512 lea(scratchReg, src); 2513 movdqu(dst, Address(scratchReg, 0)); 2514 } 2515 } 2516 2517 void MacroAssembler::vmovdqu(Address dst, XMMRegister src) { 2518 assert(((src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2519 Assembler::vmovdqu(dst, src); 2520 } 2521 2522 void MacroAssembler::vmovdqu(XMMRegister dst, Address src) { 2523 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2524 Assembler::vmovdqu(dst, src); 2525 } 2526 2527 void MacroAssembler::vmovdqu(XMMRegister dst, XMMRegister src) { 2528 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 2529 Assembler::vmovdqu(dst, src); 2530 } 2531 2532 void MacroAssembler::vmovdqu(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 2533 if (reachable(src)) { 2534 vmovdqu(dst, as_Address(src)); 2535 } 2536 else { 2537 lea(scratch_reg, src); 2538 vmovdqu(dst, Address(scratch_reg, 0)); 2539 } 2540 } 2541 2542 void MacroAssembler::evmovdquq(XMMRegister dst, AddressLiteral src, int vector_len, Register rscratch) { 2543 if (reachable(src)) { 2544 Assembler::evmovdquq(dst, as_Address(src), vector_len); 2545 } else { 2546 lea(rscratch, src); 2547 Assembler::evmovdquq(dst, Address(rscratch, 0), vector_len); 2548 } 2549 } 2550 2551 void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) { 2552 if (reachable(src)) { 2553 Assembler::movdqa(dst, as_Address(src)); 2554 } else { 2555 lea(rscratch1, src); 2556 Assembler::movdqa(dst, Address(rscratch1, 0)); 2557 } 
2558 } 2559 2560 void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) { 2561 if (reachable(src)) { 2562 Assembler::movsd(dst, as_Address(src)); 2563 } else { 2564 lea(rscratch1, src); 2565 Assembler::movsd(dst, Address(rscratch1, 0)); 2566 } 2567 } 2568 2569 void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) { 2570 if (reachable(src)) { 2571 Assembler::movss(dst, as_Address(src)); 2572 } else { 2573 lea(rscratch1, src); 2574 Assembler::movss(dst, Address(rscratch1, 0)); 2575 } 2576 } 2577 2578 void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) { 2579 if (reachable(src)) { 2580 Assembler::mulsd(dst, as_Address(src)); 2581 } else { 2582 lea(rscratch1, src); 2583 Assembler::mulsd(dst, Address(rscratch1, 0)); 2584 } 2585 } 2586 2587 void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) { 2588 if (reachable(src)) { 2589 Assembler::mulss(dst, as_Address(src)); 2590 } else { 2591 lea(rscratch1, src); 2592 Assembler::mulss(dst, Address(rscratch1, 0)); 2593 } 2594 } 2595 2596 void MacroAssembler::null_check(Register reg, int offset) { 2597 if (needs_explicit_null_check(offset)) { 2598 // provoke OS NULL exception if reg = NULL by 2599 // accessing M[reg] w/o changing any (non-CC) registers 2600 // NOTE: cmpl is plenty here to provoke a segv 2601 cmpptr(rax, Address(reg, 0)); 2602 // Note: should probably use testl(rax, Address(reg, 0)); 2603 // may be shorter code (however, this version of 2604 // testl needs to be implemented first) 2605 } else { 2606 // nothing to do, (later) access of M[reg + offset] 2607 // will provoke OS NULL exception if reg = NULL 2608 } 2609 } 2610 2611 void MacroAssembler::os_breakpoint() { 2612 // instead of directly emitting a breakpoint, call os::breakpoint for better debuggability 2613 // (e.g., MSVC can't call ps() otherwise) 2614 call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint))); 2615 } 2616 2617 void MacroAssembler::unimplemented(const char* what) { 2618 const char* buf = NULL; 2619 { 2620 ResourceMark rm; 2621 stringStream ss; 2622 ss.print("unimplemented: %s", what); 2623 buf = code_string(ss.as_string()); 2624 } 2625 stop(buf); 2626 } 2627 2628 #ifdef _LP64 2629 #define XSTATE_BV 0x200 2630 #endif 2631 2632 void MacroAssembler::pop_CPU_state() { 2633 pop_FPU_state(); 2634 pop_IU_state(); 2635 } 2636 2637 void MacroAssembler::pop_FPU_state() { 2638 #ifndef _LP64 2639 frstor(Address(rsp, 0)); 2640 #else 2641 fxrstor(Address(rsp, 0)); 2642 #endif 2643 addptr(rsp, FPUStateSizeInWords * wordSize); 2644 } 2645 2646 void MacroAssembler::pop_IU_state() { 2647 popa(); 2648 LP64_ONLY(addq(rsp, 8)); 2649 popf(); 2650 } 2651 2652 // Save Integer and Float state 2653 // Warning: Stack must be 16 byte aligned (64bit) 2654 void MacroAssembler::push_CPU_state() { 2655 push_IU_state(); 2656 push_FPU_state(); 2657 } 2658 2659 void MacroAssembler::push_FPU_state() { 2660 subptr(rsp, FPUStateSizeInWords * wordSize); 2661 #ifndef _LP64 2662 fnsave(Address(rsp, 0)); 2663 fwait(); 2664 #else 2665 fxsave(Address(rsp, 0)); 2666 #endif // LP64 2667 } 2668 2669 void MacroAssembler::push_IU_state() { 2670 // Push flags first because pusha kills them 2671 pushf(); 2672 // Make sure rsp stays 16-byte aligned 2673 LP64_ONLY(subq(rsp, 8)); 2674 pusha(); 2675 } 2676 2677 void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp) { // determine java_thread register 2678 if (!java_thread->is_valid()) { 2679 java_thread = rdi; 2680 get_thread(java_thread); 2681 } 2682 // we must set sp to zero to clear frame 2683
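// Ordering note: a non-zero last_Java_sp is what marks this anchor as
// walkable, so it is cleared first; fp and pc are only cleared afterwards,
// so a stack walker that checks sp first never treats a partially-reset
// anchor as a valid frame.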
movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD); 2684 if (clear_fp) { 2685 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD); 2686 } 2687 2688 // Always clear the pc because it could have been set by make_walkable() 2689 movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD); 2690 2691 vzeroupper(); 2692 } 2693 2694 void MacroAssembler::restore_rax(Register tmp) { 2695 if (tmp == noreg) pop(rax); 2696 else if (tmp != rax) mov(rax, tmp); 2697 } 2698 2699 void MacroAssembler::round_to(Register reg, int modulus) { 2700 addptr(reg, modulus - 1); 2701 andptr(reg, -modulus); 2702 } 2703 2704 void MacroAssembler::save_rax(Register tmp) { 2705 if (tmp == noreg) push(rax); 2706 else if (tmp != rax) mov(tmp, rax); 2707 } 2708 2709 void MacroAssembler::safepoint_poll(Label& slow_path, Register thread_reg, Register temp_reg) { 2710 #ifdef _LP64 2711 assert(thread_reg == r15_thread, "should be"); 2712 #else 2713 if (thread_reg == noreg) { 2714 thread_reg = temp_reg; 2715 get_thread(thread_reg); 2716 } 2717 #endif 2718 testb(Address(thread_reg, Thread::polling_page_offset()), SafepointMechanism::poll_bit()); 2719 jcc(Assembler::notZero, slow_path); // handshake bit set implies poll 2720 } 2721 2722 // Calls to C land 2723 // 2724 // When entering C land, the rbp, & rsp of the last Java frame have to be recorded 2725 // in the (thread-local) JavaThread object. When leaving C land, the last Java fp 2726 // has to be reset to 0. This is required to allow proper stack traversal. 2727 void MacroAssembler::set_last_Java_frame(Register java_thread, 2728 Register last_java_sp, 2729 Register last_java_fp, 2730 address last_java_pc) { 2731 vzeroupper(); 2732 // determine java_thread register 2733 if (!java_thread->is_valid()) { 2734 java_thread = rdi; 2735 get_thread(java_thread); 2736 } 2737 // determine last_java_sp register 2738 if (!last_java_sp->is_valid()) { 2739 last_java_sp = rsp; 2740 } 2741 2742 // last_java_fp is optional 2743 2744 if (last_java_fp->is_valid()) { 2745 movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp); 2746 } 2747 2748 // last_java_pc is optional 2749 2750 if (last_java_pc != NULL) { 2751 lea(Address(java_thread, 2752 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()), 2753 InternalAddress(last_java_pc)); 2754 2755 } 2756 movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp); 2757 } 2758 2759 void MacroAssembler::shlptr(Register dst, int imm8) { 2760 LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8)); 2761 } 2762 2763 void MacroAssembler::shrptr(Register dst, int imm8) { 2764 LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8)); 2765 } 2766 2767 void MacroAssembler::sign_extend_byte(Register reg) { 2768 if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) { 2769 movsbl(reg, reg); // movsxb 2770 } else { 2771 shll(reg, 24); 2772 sarl(reg, 24); 2773 } 2774 } 2775 2776 void MacroAssembler::sign_extend_short(Register reg) { 2777 if (LP64_ONLY(true ||) VM_Version::is_P6()) { 2778 movswl(reg, reg); // movsxw 2779 } else { 2780 shll(reg, 16); 2781 sarl(reg, 16); 2782 } 2783 } 2784 2785 void MacroAssembler::testl(Register dst, AddressLiteral src) { 2786 assert(reachable(src), "Address should be reachable"); 2787 testl(dst, as_Address(src)); 2788 } 2789 2790 void MacroAssembler::pcmpeqb(XMMRegister dst, XMMRegister src) { 2791 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM 
register should be 0-15"); 2792 Assembler::pcmpeqb(dst, src); 2793 } 2794 2795 void MacroAssembler::pcmpeqw(XMMRegister dst, XMMRegister src) { 2796 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 2797 Assembler::pcmpeqw(dst, src); 2798 } 2799 2800 void MacroAssembler::pcmpestri(XMMRegister dst, Address src, int imm8) { 2801 assert((dst->encoding() < 16),"XMM register should be 0-15"); 2802 Assembler::pcmpestri(dst, src, imm8); 2803 } 2804 2805 void MacroAssembler::pcmpestri(XMMRegister dst, XMMRegister src, int imm8) { 2806 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 2807 Assembler::pcmpestri(dst, src, imm8); 2808 } 2809 2810 void MacroAssembler::pmovzxbw(XMMRegister dst, XMMRegister src) { 2811 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 2812 Assembler::pmovzxbw(dst, src); 2813 } 2814 2815 void MacroAssembler::pmovzxbw(XMMRegister dst, Address src) { 2816 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 2817 Assembler::pmovzxbw(dst, src); 2818 } 2819 2820 void MacroAssembler::pmovmskb(Register dst, XMMRegister src) { 2821 assert((src->encoding() < 16),"XMM register should be 0-15"); 2822 Assembler::pmovmskb(dst, src); 2823 } 2824 2825 void MacroAssembler::ptest(XMMRegister dst, XMMRegister src) { 2826 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 2827 Assembler::ptest(dst, src); 2828 } 2829 2830 void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) { 2831 if (reachable(src)) { 2832 Assembler::sqrtsd(dst, as_Address(src)); 2833 } else { 2834 lea(rscratch1, src); 2835 Assembler::sqrtsd(dst, Address(rscratch1, 0)); 2836 } 2837 } 2838 2839 void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) { 2840 if (reachable(src)) { 2841 Assembler::sqrtss(dst, as_Address(src)); 2842 } else { 2843 lea(rscratch1, src); 2844 Assembler::sqrtss(dst, Address(rscratch1, 0)); 2845 } 2846 } 2847 2848 void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) { 2849 if (reachable(src)) { 2850 Assembler::subsd(dst, as_Address(src)); 2851 } else { 2852 lea(rscratch1, src); 2853 Assembler::subsd(dst, Address(rscratch1, 0)); 2854 } 2855 } 2856 2857 void MacroAssembler::roundsd(XMMRegister dst, AddressLiteral src, int32_t rmode, Register scratch_reg) { 2858 if (reachable(src)) { 2859 Assembler::roundsd(dst, as_Address(src), rmode); 2860 } else { 2861 lea(scratch_reg, src); 2862 Assembler::roundsd(dst, Address(scratch_reg, 0), rmode); 2863 } 2864 } 2865 2866 void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) { 2867 if (reachable(src)) { 2868 Assembler::subss(dst, as_Address(src)); 2869 } else { 2870 lea(rscratch1, src); 2871 Assembler::subss(dst, Address(rscratch1, 0)); 2872 } 2873 } 2874 2875 void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) { 2876 if (reachable(src)) { 2877 Assembler::ucomisd(dst, as_Address(src)); 2878 } else { 2879 lea(rscratch1, src); 2880 Assembler::ucomisd(dst, Address(rscratch1, 0)); 2881 } 2882 } 2883 2884 void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) { 2885 if (reachable(src)) { 2886 Assembler::ucomiss(dst, as_Address(src)); 2887 } else { 2888 lea(rscratch1, src); 2889 Assembler::ucomiss(dst, Address(rscratch1, 0)); 2890 } 2891 } 2892 2893 void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 2894 
// Used in sign-bit flipping with aligned address. 2895 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 2896 if (reachable(src)) { 2897 Assembler::xorpd(dst, as_Address(src)); 2898 } else { 2899 lea(scratch_reg, src); 2900 Assembler::xorpd(dst, Address(scratch_reg, 0)); 2901 } 2902 } 2903 2904 void MacroAssembler::xorpd(XMMRegister dst, XMMRegister src) { 2905 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { 2906 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 2907 } 2908 else { 2909 Assembler::xorpd(dst, src); 2910 } 2911 } 2912 2913 void MacroAssembler::xorps(XMMRegister dst, XMMRegister src) { 2914 if (UseAVX > 2 && !VM_Version::supports_avx512dq() && (dst->encoding() == src->encoding())) { 2915 Assembler::vpxor(dst, dst, src, Assembler::AVX_512bit); 2916 } else { 2917 Assembler::xorps(dst, src); 2918 } 2919 } 2920 2921 void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src, Register scratch_reg) { 2922 // Used in sign-bit flipping with aligned address. 2923 assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes"); 2924 if (reachable(src)) { 2925 Assembler::xorps(dst, as_Address(src)); 2926 } else { 2927 lea(scratch_reg, src); 2928 Assembler::xorps(dst, Address(scratch_reg, 0)); 2929 } 2930 } 2931 2932 void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) { 2933 // Used in sign-bit flipping with aligned address. 2934 bool aligned_adr = (((intptr_t)src.target() & 15) == 0); 2935 assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes"); 2936 if (reachable(src)) { 2937 Assembler::pshufb(dst, as_Address(src)); 2938 } else { 2939 lea(rscratch1, src); 2940 Assembler::pshufb(dst, Address(rscratch1, 0)); 2941 } 2942 } 2943 2944 // AVX 3-operands instructions 2945 2946 void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 2947 if (reachable(src)) { 2948 vaddsd(dst, nds, as_Address(src)); 2949 } else { 2950 lea(rscratch1, src); 2951 vaddsd(dst, nds, Address(rscratch1, 0)); 2952 } 2953 } 2954 2955 void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 2956 if (reachable(src)) { 2957 vaddss(dst, nds, as_Address(src)); 2958 } else { 2959 lea(rscratch1, src); 2960 vaddss(dst, nds, Address(rscratch1, 0)); 2961 } 2962 } 2963 2964 void MacroAssembler::vpaddd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register rscratch) { 2965 assert(UseAVX > 0, "requires some form of AVX"); 2966 if (reachable(src)) { 2967 Assembler::vpaddd(dst, nds, as_Address(src), vector_len); 2968 } else { 2969 lea(rscratch, src); 2970 Assembler::vpaddd(dst, nds, Address(rscratch, 0), vector_len); 2971 } 2972 } 2973 2974 void MacroAssembler::vabsss(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) { 2975 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 2976 vandps(dst, nds, negate_field, vector_len); 2977 } 2978 2979 void MacroAssembler::vabssd(XMMRegister dst, XMMRegister nds, XMMRegister src, AddressLiteral negate_field, int vector_len) { 2980 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 2981 vandpd(dst, nds, negate_field, vector_len); 2982 } 2983 2984 void MacroAssembler::vpaddb(XMMRegister 
dst, XMMRegister nds, XMMRegister src, int vector_len) { 2985 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 2986 Assembler::vpaddb(dst, nds, src, vector_len); 2987 } 2988 2989 void MacroAssembler::vpaddb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 2990 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 2991 Assembler::vpaddb(dst, nds, src, vector_len); 2992 } 2993 2994 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 2995 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 2996 Assembler::vpaddw(dst, nds, src, vector_len); 2997 } 2998 2999 void MacroAssembler::vpaddw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3000 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3001 Assembler::vpaddw(dst, nds, src, vector_len); 3002 } 3003 3004 void MacroAssembler::vpand(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3005 if (reachable(src)) { 3006 Assembler::vpand(dst, nds, as_Address(src), vector_len); 3007 } else { 3008 lea(scratch_reg, src); 3009 Assembler::vpand(dst, nds, Address(scratch_reg, 0), vector_len); 3010 } 3011 } 3012 3013 void MacroAssembler::vpbroadcastw(XMMRegister dst, XMMRegister src, int vector_len) { 3014 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3015 Assembler::vpbroadcastw(dst, src, vector_len); 3016 } 3017 3018 void MacroAssembler::vpcmpeqb(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3019 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3020 Assembler::vpcmpeqb(dst, nds, src, vector_len); 3021 } 3022 3023 void MacroAssembler::vpcmpeqw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3024 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3025 Assembler::vpcmpeqw(dst, nds, src, vector_len); 3026 } 3027 3028 void MacroAssembler::vpmovzxbw(XMMRegister dst, Address src, int vector_len) { 3029 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3030 Assembler::vpmovzxbw(dst, src, vector_len); 3031 } 3032 3033 void MacroAssembler::vpmovmskb(Register dst, XMMRegister src) { 3034 assert((src->encoding() < 16),"XMM register should be 0-15"); 3035 Assembler::vpmovmskb(dst, src); 3036 } 3037 3038 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3039 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3040 Assembler::vpmullw(dst, nds, src, vector_len); 3041 } 3042 3043 void MacroAssembler::vpmullw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3044 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3045 Assembler::vpmullw(dst, nds, src, vector_len); 3046 } 3047 3048 void MacroAssembler::vpsubb(XMMRegister dst, 
XMMRegister nds, XMMRegister src, int vector_len) { 3049 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3050 Assembler::vpsubb(dst, nds, src, vector_len); 3051 } 3052 3053 void MacroAssembler::vpsubb(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3054 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3055 Assembler::vpsubb(dst, nds, src, vector_len); 3056 } 3057 3058 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, XMMRegister src, int vector_len) { 3059 assert(((dst->encoding() < 16 && src->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3060 Assembler::vpsubw(dst, nds, src, vector_len); 3061 } 3062 3063 void MacroAssembler::vpsubw(XMMRegister dst, XMMRegister nds, Address src, int vector_len) { 3064 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3065 Assembler::vpsubw(dst, nds, src, vector_len); 3066 } 3067 3068 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3069 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3070 Assembler::vpsraw(dst, nds, shift, vector_len); 3071 } 3072 3073 void MacroAssembler::vpsraw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3074 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3075 Assembler::vpsraw(dst, nds, shift, vector_len); 3076 } 3077 3078 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3079 assert(UseAVX > 2,""); 3080 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3081 vector_len = 2; 3082 } 3083 Assembler::evpsraq(dst, nds, shift, vector_len); 3084 } 3085 3086 void MacroAssembler::evpsraq(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3087 assert(UseAVX > 2,""); 3088 if (!VM_Version::supports_avx512vl() && vector_len < 2) { 3089 vector_len = 2; 3090 } 3091 Assembler::evpsraq(dst, nds, shift, vector_len); 3092 } 3093 3094 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3095 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3096 Assembler::vpsrlw(dst, nds, shift, vector_len); 3097 } 3098 3099 void MacroAssembler::vpsrlw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3100 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3101 Assembler::vpsrlw(dst, nds, shift, vector_len); 3102 } 3103 3104 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, XMMRegister shift, int vector_len) { 3105 assert(((dst->encoding() < 16 && shift->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3106 Assembler::vpsllw(dst, nds, shift, vector_len); 3107 } 3108 3109 void MacroAssembler::vpsllw(XMMRegister dst, XMMRegister nds, int shift, int vector_len) { 3110 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3111 Assembler::vpsllw(dst, 
nds, shift, vector_len); 3112 } 3113 3114 void MacroAssembler::vptest(XMMRegister dst, XMMRegister src) { 3115 assert((dst->encoding() < 16 && src->encoding() < 16),"XMM register should be 0-15"); 3116 Assembler::vptest(dst, src); 3117 } 3118 3119 void MacroAssembler::punpcklbw(XMMRegister dst, XMMRegister src) { 3120 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3121 Assembler::punpcklbw(dst, src); 3122 } 3123 3124 void MacroAssembler::pshufd(XMMRegister dst, Address src, int mode) { 3125 assert(((dst->encoding() < 16) || VM_Version::supports_avx512vl()),"XMM register should be 0-15"); 3126 Assembler::pshufd(dst, src, mode); 3127 } 3128 3129 void MacroAssembler::pshuflw(XMMRegister dst, XMMRegister src, int mode) { 3130 assert(((dst->encoding() < 16 && src->encoding() < 16) || VM_Version::supports_avx512vlbw()),"XMM register should be 0-15"); 3131 Assembler::pshuflw(dst, src, mode); 3132 } 3133 3134 void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3135 if (reachable(src)) { 3136 vandpd(dst, nds, as_Address(src), vector_len); 3137 } else { 3138 lea(scratch_reg, src); 3139 vandpd(dst, nds, Address(scratch_reg, 0), vector_len); 3140 } 3141 } 3142 3143 void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3144 if (reachable(src)) { 3145 vandps(dst, nds, as_Address(src), vector_len); 3146 } else { 3147 lea(scratch_reg, src); 3148 vandps(dst, nds, Address(scratch_reg, 0), vector_len); 3149 } 3150 } 3151 3152 void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3153 if (reachable(src)) { 3154 vdivsd(dst, nds, as_Address(src)); 3155 } else { 3156 lea(rscratch1, src); 3157 vdivsd(dst, nds, Address(rscratch1, 0)); 3158 } 3159 } 3160 3161 void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3162 if (reachable(src)) { 3163 vdivss(dst, nds, as_Address(src)); 3164 } else { 3165 lea(rscratch1, src); 3166 vdivss(dst, nds, Address(rscratch1, 0)); 3167 } 3168 } 3169 3170 void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3171 if (reachable(src)) { 3172 vmulsd(dst, nds, as_Address(src)); 3173 } else { 3174 lea(rscratch1, src); 3175 vmulsd(dst, nds, Address(rscratch1, 0)); 3176 } 3177 } 3178 3179 void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3180 if (reachable(src)) { 3181 vmulss(dst, nds, as_Address(src)); 3182 } else { 3183 lea(rscratch1, src); 3184 vmulss(dst, nds, Address(rscratch1, 0)); 3185 } 3186 } 3187 3188 void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3189 if (reachable(src)) { 3190 vsubsd(dst, nds, as_Address(src)); 3191 } else { 3192 lea(rscratch1, src); 3193 vsubsd(dst, nds, Address(rscratch1, 0)); 3194 } 3195 } 3196 3197 void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3198 if (reachable(src)) { 3199 vsubss(dst, nds, as_Address(src)); 3200 } else { 3201 lea(rscratch1, src); 3202 vsubss(dst, nds, Address(rscratch1, 0)); 3203 } 3204 } 3205 3206 void MacroAssembler::vnegatess(XMMRegister dst, XMMRegister nds, AddressLiteral src) { 3207 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3208 vxorps(dst, nds, src, Assembler::AVX_128bit); 3209 } 3210 3211 void MacroAssembler::vnegatesd(XMMRegister dst, 
XMMRegister nds, AddressLiteral src) { 3212 assert(((dst->encoding() < 16 && nds->encoding() < 16) || VM_Version::supports_avx512vldq()),"XMM register should be 0-15"); 3213 vxorpd(dst, nds, src, Assembler::AVX_128bit); 3214 } 3215 3216 void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3217 if (reachable(src)) { 3218 vxorpd(dst, nds, as_Address(src), vector_len); 3219 } else { 3220 lea(scratch_reg, src); 3221 vxorpd(dst, nds, Address(scratch_reg, 0), vector_len); 3222 } 3223 } 3224 3225 void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3226 if (reachable(src)) { 3227 vxorps(dst, nds, as_Address(src), vector_len); 3228 } else { 3229 lea(scratch_reg, src); 3230 vxorps(dst, nds, Address(scratch_reg, 0), vector_len); 3231 } 3232 } 3233 3234 void MacroAssembler::vpxor(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len, Register scratch_reg) { 3235 if (UseAVX > 1 || (vector_len < 1)) { 3236 if (reachable(src)) { 3237 Assembler::vpxor(dst, nds, as_Address(src), vector_len); 3238 } else { 3239 lea(scratch_reg, src); 3240 Assembler::vpxor(dst, nds, Address(scratch_reg, 0), vector_len); 3241 } 3242 } 3243 else { 3244 MacroAssembler::vxorpd(dst, nds, src, vector_len, scratch_reg); 3245 } 3246 } 3247 3248 //------------------------------------------------------------------------------------------- 3249 3250 void MacroAssembler::clear_jweak_tag(Register possibly_jweak) { 3251 const int32_t inverted_jweak_mask = ~static_cast<int32_t>(JNIHandles::weak_tag_mask); 3252 STATIC_ASSERT(inverted_jweak_mask == -2); // otherwise check this code 3253 // The inverted mask is sign-extended 3254 andptr(possibly_jweak, inverted_jweak_mask); 3255 } 3256 3257 void MacroAssembler::resolve_jobject(Register value, 3258 Register thread, 3259 Register tmp) { 3260 assert_different_registers(value, thread, tmp); 3261 Label done, not_weak; 3262 testptr(value, value); 3263 jcc(Assembler::zero, done); // Use NULL as-is. 3264 testptr(value, JNIHandles::weak_tag_mask); // Test for jweak tag. 3265 jcc(Assembler::zero, not_weak); 3266 // Resolve jweak. 3267 access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, 3268 value, Address(value, -JNIHandles::weak_tag_value), tmp, thread); 3269 verify_oop(value); 3270 jmp(done); 3271 bind(not_weak); 3272 // Resolve (untagged) jobject. 
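// JNI handle tagging, for reference: handles are pointer-aligned, and a
// jweak is distinguished by weak_tag_value (1) set in the low bit. The weak
// load above strips the tag via the -JNIHandles::weak_tag_value displacement;
// an untagged jobject needs no adjustment, hence Address(value, 0) below.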
3273 access_load_at(T_OBJECT, IN_NATIVE, value, Address(value, 0), tmp, thread); 3274 verify_oop(value); 3275 bind(done); 3276 } 3277 3278 void MacroAssembler::subptr(Register dst, int32_t imm32) { 3279 LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32)); 3280 } 3281 3282 // Force generation of a 4 byte immediate value even if it fits into 8 bits 3283 void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) { 3284 LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32)); 3285 } 3286 3287 void MacroAssembler::subptr(Register dst, Register src) { 3288 LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src)); 3289 } 3290 3291 // C++ bool manipulation 3292 void MacroAssembler::testbool(Register dst) { 3293 if(sizeof(bool) == 1) 3294 testb(dst, 0xff); 3295 else if(sizeof(bool) == 2) { 3296 // testw implementation needed for two byte bools 3297 ShouldNotReachHere(); 3298 } else if(sizeof(bool) == 4) 3299 testl(dst, dst); 3300 else 3301 // unsupported 3302 ShouldNotReachHere(); 3303 } 3304 3305 void MacroAssembler::testptr(Register dst, Register src) { 3306 LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src)); 3307 } 3308 3309 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 3310 void MacroAssembler::tlab_allocate(Register thread, Register obj, 3311 Register var_size_in_bytes, 3312 int con_size_in_bytes, 3313 Register t1, 3314 Register t2, 3315 Label& slow_case) { 3316 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3317 bs->tlab_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); 3318 } 3319 3320 // Defines obj, preserves var_size_in_bytes 3321 void MacroAssembler::eden_allocate(Register thread, Register obj, 3322 Register var_size_in_bytes, 3323 int con_size_in_bytes, 3324 Register t1, 3325 Label& slow_case) { 3326 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 3327 bs->eden_allocate(this, thread, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); 3328 } 3329 3330 // Preserves the contents of address, destroys the contents of length_in_bytes and temp.
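// A rough C sketch of the code below (illustrative only):
//
//   intptr_t* base = (intptr_t*)(address + offset_in_bytes);
//   for (size_t i = length_in_bytes / BytesPerWord; i > 0; ) {
//     base[--i] = 0;                  // walk the range top-down
//   }
//
// The emitted loop does the same walk, one word per iteration on 64bit and
// two words per iteration on 32bit, where a possible odd leading word is
// peeled off first using the carry flag produced by the index shift.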
3331 void MacroAssembler::zero_memory(Register address, Register length_in_bytes, int offset_in_bytes, Register temp) { 3332 assert(address != length_in_bytes && address != temp && temp != length_in_bytes, "registers must be different"); 3333 assert((offset_in_bytes & (BytesPerWord - 1)) == 0, "offset must be a multiple of BytesPerWord"); 3334 Label done; 3335 3336 testptr(length_in_bytes, length_in_bytes); 3337 jcc(Assembler::zero, done); 3338 3339 // initialize topmost word, divide index by 2, check if odd and test if zero 3340 // note: for the remaining code to work, index must be a multiple of BytesPerWord 3341 #ifdef ASSERT 3342 { 3343 Label L; 3344 testptr(length_in_bytes, BytesPerWord - 1); 3345 jcc(Assembler::zero, L); 3346 stop("length must be a multiple of BytesPerWord"); 3347 bind(L); 3348 } 3349 #endif 3350 Register index = length_in_bytes; 3351 xorptr(temp, temp); // use _zero reg to clear memory (shorter code) 3352 if (UseIncDec) { 3353 shrptr(index, 3); // divide by 8/16 and set carry flag if bit 2 was set 3354 } else { 3355 shrptr(index, 2); // use 2 instructions to avoid partial flag stall 3356 shrptr(index, 1); 3357 } 3358 #ifndef _LP64 3359 // the byte length might not have been a multiple of 8 (i.e., bit 2 was set) 3360 { 3361 Label even; 3362 // note: if index was a multiple of 8, then it cannot 3363 // be 0 now; otherwise it must have been 0 before 3364 // => if it is even, we don't need to check for 0 again 3365 jcc(Assembler::carryClear, even); 3366 // clear topmost word (no jump would be needed if conditional assignment worked here) 3367 movptr(Address(address, index, Address::times_8, offset_in_bytes - 0*BytesPerWord), temp); 3368 // index could be 0 now, must check again 3369 jcc(Assembler::zero, done); 3370 bind(even); 3371 } 3372 #endif // !_LP64 3373 // initialize remaining object fields: index is a multiple of 2 now 3374 { 3375 Label loop; 3376 bind(loop); 3377 movptr(Address(address, index, Address::times_8, offset_in_bytes - 1*BytesPerWord), temp); 3378 NOT_LP64(movptr(Address(address, index, Address::times_8, offset_in_bytes - 2*BytesPerWord), temp);) 3379 decrement(index); 3380 jcc(Assembler::notZero, loop); 3381 } 3382 3383 bind(done); 3384 } 3385 3386 // Look up the method for a megamorphic invokeinterface call. 3387 // The target method is determined by <intf_klass, itable_index>. 3388 // The receiver klass is in recv_klass. 3389 // On success, the result will be in method_result, and execution falls through. 3390 // On failure, execution transfers to the given label.
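// Receiver klass layout being scanned, for reference (see klassItable):
//
//   recv_klass + vtable_start_offset()  [ vtable: vtable_length entries     ]
//   (immediately after the vtable)      [ itableOffsetEntry pairs:          ]
//                                       [   { interface klass, offset } ... ]
//                                       [   terminated by a NULL interface  ]
//   recv_klass + matching offset        [ itableMethodEntry array, indexed  ]
//                                       [   by itable_index -> Method*      ]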
// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(Klass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  movl(scan_temp, Address(recv_klass, Klass::vtable_length_offset()));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
  }
}


// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(Klass::vtable_start_offset());
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  Address vtable_entry_addr(recv_klass,
                            vtable_index, Address::times_ptr,
                            base + vtableEntry::method_offset_in_bytes());
  movptr(method_result, vtable_entry_addr);
}


void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb.  If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                          \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);   \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                          \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }              \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface.  Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);
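
  // In pseudocode, the rest of the fast path is (illustrative only):
  //
  //   if (*(Klass**)(sub_klass + super_check_offset) == super_klass) {
  //     goto L_success;      // primary super, or cached secondary super
  //   } else if (super_check_offset != sc_offset) {
  //     goto L_failure;      // a miss on a primary super is decisive
  //   } else {
  //     goto L_slow_path;    // cache miss; scan the secondary supers array
  //   }
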
  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}


void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)
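
  // The scan below is equivalent to this C sketch (illustrative only;
  // method names informal):
  //
  //   Array<Klass*>* ss = sub_klass->secondary_supers();
  //   for (int i = 0; i < ss->length(); i++) {
  //     if (ss->at(i) == super_klass) {
  //       sub_klass->set_secondary_super_cache(super_klass);
  //       return true;     // Z flag set
  //     }
  //   }
  //   return false;        // Z flag clear
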
  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length.  (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // The Z flag will not be set by 'repne' if RCX == 0, since 'repne' by
  // itself does not change flags; only the repeated scas instruction does.
  // So set Z = 0 (not equal) before 'repne' to indicate "class not found"
  // in case the array is empty.

  testptr(rax, rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi) pop(rdi);
  if (pushed_rcx) pop(rcx);
  if (pushed_rax) pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files:  rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
    jccb(Assembler::notEqual, *L_failure);
  else
    jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}

void MacroAssembler::clinit_barrier(Register klass, Register thread, Label* L_fast_path, Label* L_slow_path) {
  assert(L_fast_path != NULL || L_slow_path != NULL, "at least one is required");

  Label L_fallthrough;
  if (L_fast_path == NULL) {
    L_fast_path = &L_fallthrough;
  } else if (L_slow_path == NULL) {
    L_slow_path = &L_fallthrough;
  }

  // Fast path check: class is fully initialized
  cmpb(Address(klass, InstanceKlass::init_state_offset()), InstanceKlass::fully_initialized);
  jcc(Assembler::equal, *L_fast_path);

  // Fast path check: current thread is initializer thread
  cmpptr(thread, Address(klass, InstanceKlass::init_thread_offset()));
  if (L_slow_path == &L_fallthrough) {
    jcc(Assembler::equal, *L_fast_path);
    bind(*L_slow_path);
  } else if (L_fast_path == &L_fallthrough) {
    jcc(Assembler::notEqual, *L_slow_path);
    bind(*L_fast_path);
  } else {
    Unimplemented();
  }
}

void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s (%s:%d)", reg->name(), s, file, line);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1);    // save r10, trashed by movptr()
#endif
  push(rax);          // save rax
  push(reg);          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}

void MacroAssembler::vallones(XMMRegister dst, int vector_len) {
  if (UseAVX > 2 && (vector_len == Assembler::AVX_512bit || VM_Version::supports_avx512vl())) {
    vpternlogd(dst, 0xFF, dst, dst, vector_len);
  } else {
    assert(UseAVX > 0, "");
    vpcmpeqb(dst, dst, dst, vector_len);
  }
}

RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      const char* buf = NULL;
      {
        ResourceMark rm;
        stringStream ss;
        ss.print("DelayedValue=" INTPTR_FORMAT, delayed_value_addr[1]);
        buf = code_string(ss.as_string());
      }
      jcc(Assembler::notZero, L);
      STOP(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}


Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register scale_reg    = noreg;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    scale_reg    = arg_slot.as_register();
    scale_factor = Address::times(stackElementSize);
  }
  offset += wordSize;           // return PC is on stack
  return Address(rsp, scale_reg, scale_factor, offset);
}


void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s (%s:%d)", s, file, line);
    b = code_string(ss.as_string());
  }
#ifdef _LP64
  push(rscratch1);    // save r10, trashed by movptr()
#endif
  push(rax);          // save rax
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: the 64-bit code seemed to have had a bug here: it did
  // movq(addr, rax), which stores rax into addr, the reverse of what
  // was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}
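
// verify_tlab below checks, in effect (illustrative pseudocode):
//
//   assert(thread->tlab().start() <= thread->tlab().top(), "top >= start");
//   assert(thread->tlab().top()   <= thread->tlab().end(), "top <= end");
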
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}

class ControlWord {
 public:
  int32_t _value;

  int  rounding_control() const  { return  (_value >> 10) & 3      ; }
  int  precision_control() const { return  (_value >>  8) & 3      ; }
  bool precision() const         { return ((_value >>  5) & 1) != 0; }
  bool underflow() const         { return ((_value >>  4) & 1) != 0; }
  bool overflow() const          { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const       { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const      { return ((_value >>  1) & 1) != 0; }
  bool invalid() const           { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // rounding control
    const char* rc;
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up  "; break;
      case 3: rc = "chop      "; break;
    }
    // precision control
    const char* pc;
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
    }
    // flags
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision   ()) ? 'P' : 'p';
    f[3] = (underflow   ()) ? 'U' : 'u';
    f[4] = (overflow    ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid     ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }
};

class StatusWord {
 public:
  int32_t _value;

  bool busy() const         { return ((_value >> 15) & 1) != 0; }
  bool C3() const           { return ((_value >> 14) & 1) != 0; }
  bool C2() const           { return ((_value >> 10) & 1) != 0; }
  bool C1() const           { return ((_value >>  9) & 1) != 0; }
  bool C0() const           { return ((_value >>  8) & 1) != 0; }
  int  top() const          { return  (_value >> 11) & 7      ; }
  bool error_status() const { return ((_value >>  7) & 1) != 0; }
  bool stack_fault() const  { return ((_value >>  6) & 1) != 0; }
  bool precision() const    { return ((_value >>  5) & 1) != 0; }
  bool underflow() const    { return ((_value >>  4) & 1) != 0; }
  bool overflow() const     { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const  { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const { return ((_value >>  1) & 1) != 0; }
  bool invalid() const      { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // condition codes
    char c[5];
    c[0] = (C3()) ? '3' : '-';
    c[1] = (C2()) ? '2' : '-';
    c[2] = (C1()) ? '1' : '-';
    c[3] = (C0()) ? '0' : '-';
    c[4] = '\x0';
    // flags
    char f[9];
    f[0] = (error_status()) ? 'E' : '-';
    f[1] = (stack_fault ()) ? 'S' : '-';
    f[2] = (precision   ()) ? 'P' : '-';
    f[3] = (underflow   ()) ? 'U' : '-';
    f[4] = (overflow    ()) ? 'O' : '-';
    f[5] = (zero_divide ()) ? 'Z' : '-';
    f[6] = (denormalized()) ? 'D' : '-';
    f[7] = (invalid     ()) ? 'I' : '-';
    f[8] = '\x0';
    // output
    printf("%04x flags = %s, cc = %s, top = %d", _value & 0xFFFF, f, c, top());
  }
};

class TagWord {
 public:
  int32_t _value;

  int tag_at(int i) const { return (_value >> (i*2)) & 3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }
};

class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  bool is_indefinite() const {
    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  void print() const {
    char sign = (_ex < 0) ? '-' : '+';
    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x %s", sign, _ex, _m1, _m0, kind);
  }
};

class FPU_State {
 public:
  enum {
    register_size       = 10,
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord _control_word;
  StatusWord  _status_word;
  TagWord     _tag_word;
  int32_t     _error_offset;
  int32_t     _error_selector;
  int32_t     _data_offset;
  int32_t     _data_selector;
  int8_t      _register[register_size * number_of_registers];

  int tag_for_st(int i) const { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  FPU_Register* st(int i) const { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  void print() const {
    // print computation registers
    {
      int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }
};

class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const        { return ((_value >> 11) & 1) != 0; }
  bool direction() const       { return ((_value >> 10) & 1) != 0; }
  bool sign() const            { return ((_value >>  7) & 1) != 0; }
  bool zero() const            { return ((_value >>  6) & 1) != 0; }
  bool auxiliary_carry() const { return ((_value >>  4) & 1) != 0; }
  bool parity() const          { return ((_value >>  2) & 1) != 0; }
  bool carry() const           { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // flags
    char f[8];
    f[0] = (overflow       ()) ? 'O' : '-';
    f[1] = (direction      ()) ? 'D' : '-';
    f[2] = (sign           ()) ? 'S' : '-';
    f[3] = (zero           ()) ? 'Z' : '-';
    f[4] = (auxiliary_carry()) ? 'A' : '-';
    f[5] = (parity         ()) ? 'P' : '-';
    f[6] = (carry          ()) ? 'C' : '-';
    f[7] = '\x0';
    // output
    printf("%08x flags = %s", _value, f);
  }
};

class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x %11d", _value, _value);
  }
};

class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  void print() const {
    // computation registers
    printf("rax  = "); _rax.print(); printf("\n");
    printf("rbx  = "); _rbx.print(); printf("\n");
    printf("rcx  = "); _rcx.print(); printf("\n");
    printf("rdx  = "); _rdx.print(); printf("\n");
    printf("rdi  = "); _rdi.print(); printf("\n");
    printf("rsi  = "); _rsi.print(); printf("\n");
    printf("rbp  = "); _rbp.print(); printf("\n");
    printf("rsp  = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};


class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }
};


static void _print_CPU_state(CPU_State* state) {
  state->print();
}


void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);    // discard argument
  pop_CPU_state();
}


#ifndef _LP64
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
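  //
  // x87 tag values are 0 = valid, 1 = zero, 2 = special, 3 = empty.  The
  // depth computation further below is, in effect (illustrative only):
  //
  //   int i = 0;
  //   while (i < 8 && fs->tag_for_st(i) != 3) i++;   // count live entries
  //   int d = i;                                     // d = stack depth
  //   while (i < 8 && fs->tag_for_st(i) == 3) i++;   // rest must be empty
  //   if (i != 8) fail("stack not contiguous");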
  if (stack_depth < 0) {
    if (fs->tag_for_st(7) != 3) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");

  // compute stack depth
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}

void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                   // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);           // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                    // break if error condition
    bind(L);
  }
  pop_CPU_state();
}
#endif // !_LP64

void MacroAssembler::restore_cpu_control_state_after_jni() {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
  vzeroupper();

#ifdef COMPILER2
  // Reset k1 to 0xffff.
  if (PostLoopMultiversioning && VM_Version::supports_evex()) {
    push(rcx);
    movl(rcx, 0xffff);
    kmovwl(k1, rcx);
    pop(rcx);
  }
#endif // COMPILER2

#ifndef _LP64
  // Either restore the x87 floating point control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // !_LP64
}

// ((OopHandle)result).resolve();
void MacroAssembler::resolve_oop_handle(Register result, Register tmp) {
  assert_different_registers(result, tmp);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // OopHandle::resolve is an indirection like jobject.
  access_load_at(T_OBJECT, IN_NATIVE,
                 result, Address(result, 0), tmp, /*tmp_thread*/noreg);
}

// ((WeakHandle)result).resolve();
void MacroAssembler::resolve_weak_handle(Register rresult, Register rtmp) {
  assert_different_registers(rresult, rtmp);
  Label resolved;

  // A null weak handle resolves to null.
  cmpptr(rresult, 0);
  jcc(Assembler::equal, resolved);

  // Only 64 bit platforms support GCs that require a tmp register
  // Only IN_HEAP loads require a thread_tmp register
  // WeakHandle::resolve is an indirection like jweak.
  access_load_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                 rresult, Address(rresult, 0), rtmp, /*tmp_thread*/noreg);
  bind(resolved);
}

void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
  // get mirror
  const int mirror_offset = in_bytes(Klass::java_mirror_offset());
  load_method_holder(mirror, method);
  movptr(mirror, Address(mirror, mirror_offset));
  resolve_oop_handle(mirror, tmp);
}

void MacroAssembler::load_method_holder_cld(Register rresult, Register rmethod) {
  load_method_holder(rresult, rmethod);
  movptr(rresult, Address(rresult, InstanceKlass::class_loader_data_offset()));
}

void MacroAssembler::load_method_holder(Register holder, Register method) {
  movptr(holder, Address(method, Method::const_offset()));                      // ConstMethod*
  movptr(holder, Address(holder, ConstMethod::constants_offset()));             // ConstantPool*
  movptr(holder, Address(holder, ConstantPool::pool_holder_offset_in_bytes())); // InstanceKlass*
}

void MacroAssembler::load_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst, tmp);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}

void MacroAssembler::load_prototype_header(Register dst, Register src, Register tmp) {
  load_klass(dst, src, tmp);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}

void MacroAssembler::store_klass(Register dst, Register src, Register tmp) {
  assert_different_registers(src, tmp);
  assert_different_registers(dst, tmp);
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src, tmp);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}

void MacroAssembler::access_load_at(BasicType type, DecoratorSet decorators, Register dst, Address src,
                                    Register tmp1, Register thread_tmp) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  } else {
    bs->load_at(this, decorators, type, dst, src, tmp1, thread_tmp);
  }
}

void MacroAssembler::access_store_at(BasicType type, DecoratorSet decorators, Address dst, Register src,
                                     Register tmp1, Register tmp2) {
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  decorators = AccessInternal::decorator_fixup(decorators);
  bool as_raw = (decorators & AS_RAW) != 0;
  if (as_raw) {
    bs->BarrierSetAssembler::store_at(this, decorators, type, dst, src, tmp1, tmp2);
  } else {
    bs->store_at(this, decorators, type, dst, src, tmp1, tmp2);
  }
}

void MacroAssembler::resolve(DecoratorSet decorators, Register obj) {
  // Use stronger ACCESS_WRITE|ACCESS_READ by default.
  if ((decorators & (ACCESS_READ | ACCESS_WRITE)) == 0) {
    decorators |= ACCESS_READ | ACCESS_WRITE;
  }
  BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
  return bs->resolve(this, decorators, obj);
}

void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1,
                                   Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp);
}

// Doesn't do verification, generates fixed size code
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1,
                                            Register thread_tmp, DecoratorSet decorators) {
  access_load_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp);
}

void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1,
                                    Register tmp2, DecoratorSet decorators) {
  access_store_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, tmp2);
}

// Used for storing NULLs.
void MacroAssembler::store_heap_oop_null(Address dst) {
  access_store_at(T_OBJECT, IN_HEAP, dst, noreg, noreg, noreg);
}

#ifdef _LP64
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}

#ifdef ASSERT
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr()));
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    pop(rscratch1);
  }
}
#endif
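
// Compressed oops pack a 64-bit heap pointer into 32 bits.  With base ==
// r12_heapbase and shift == LogMinObjAlignmentInBytes, the conversions
// below compute, in effect (illustrative pseudocode):
//
//   narrowOop encode(oop p)       { return p == NULL ? 0 : (narrowOop)((p - base) >> shift); }
//   oop       decode(narrowOop v) { return v == 0 ? NULL : (oop)(base + ((uintptr_t)v << shift)); }
//
// In "zero-based" mode (base == NULL) the add/sub of the base is omitted,
// and with shift == 0 as well, encode/decode degenerate to the identity.
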
// Algorithm must match oop.inline.hpp encode_heap_oop.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop");
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}

void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop_msg(r, "broken oop in encode_heap_oop_not_null");
  if (CompressedOops::base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop_msg(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (CompressedOops::base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (CompressedOops::shift() != 0) {
    assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}

void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (CompressedOops::base() == NULL) {
    if (CompressedOops::shift() != 0) {
      assert (LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop_msg(r, "broken oop in decode_heap_oop");
}

void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
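  //
  // Since the input is known non-null here, the null check is skipped and
  // the decode is simply r = base + (r << shift); when shift == 3 the
  // two-register variant below folds this into a single
  // lea(dst, Address(r12_heapbase, src, Address::times_8)).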
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (CompressedOops::base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
  }
}

void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedOops::shift() != 0) {
    assert(LogMinObjAlignmentInBytes == CompressedOops::shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (CompressedOops::base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (CompressedOops::base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}

void MacroAssembler::encode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  if (CompressedKlassPointers::base() != NULL) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    subq(r, tmp);
  }
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shrq(r, LogKlassAlignmentInBytes);
  }
}

void MacroAssembler::encode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  if (CompressedKlassPointers::base() != NULL) {
    mov64(dst, -(int64_t)CompressedKlassPointers::base());
    addq(dst, src);
  } else {
    movptr(dst, src);
  }
  if (CompressedKlassPointers::shift() != 0) {
    assert (LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shrq(dst, LogKlassAlignmentInBytes);
  }
}

// !!! If the instructions that get generated here change then function
// instr_size_for_decode_klass_not_null() needs to get updated.
void MacroAssembler::decode_klass_not_null(Register r, Register tmp) {
  assert_different_registers(r, tmp);
  // Note: it will change flags
  assert(UseCompressedClassPointers, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (CompressedKlassPointers::shift() != 0) {
    assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  if (CompressedKlassPointers::base() != NULL) {
    mov64(tmp, (int64_t)CompressedKlassPointers::base());
    addq(r, tmp);
  }
}

void MacroAssembler::decode_and_move_klass_not_null(Register dst, Register src) {
  assert_different_registers(src, dst);
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.

  if (CompressedKlassPointers::base() == NULL &&
      CompressedKlassPointers::shift() == 0) {
    // The best case scenario is that there is no base or shift. Then it is already
    // a pointer that needs nothing but a register rename.
    movl(dst, src);
  } else {
    if (CompressedKlassPointers::base() != NULL) {
      mov64(dst, (int64_t)CompressedKlassPointers::base());
    } else {
      xorq(dst, dst);
    }
    if (CompressedKlassPointers::shift() != 0) {
      assert(LogKlassAlignmentInBytes == CompressedKlassPointers::shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      leaq(dst, Address(dst, src, Address::times_8, 0));
    } else {
      addq(dst, src);
    }
  }
}

void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}

void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}

void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
}

void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec);
}
compressed headers"); 4674 assert (Universe::heap() != NULL, "java heap should be initialized"); 4675 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4676 int oop_index = oop_recorder()->find_index(obj); 4677 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4678 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 4679 } 4680 4681 void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) { 4682 assert (UseCompressedOops, "should only be used for compressed headers"); 4683 assert (Universe::heap() != NULL, "java heap should be initialized"); 4684 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4685 int oop_index = oop_recorder()->find_index(obj); 4686 RelocationHolder rspec = oop_Relocation::spec(oop_index); 4687 Assembler::cmp_narrow_oop(dst, oop_index, rspec); 4688 } 4689 4690 void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) { 4691 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 4692 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4693 int klass_index = oop_recorder()->find_index(k); 4694 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 4695 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 4696 } 4697 4698 void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) { 4699 assert (UseCompressedClassPointers, "should only be used for compressed headers"); 4700 assert (oop_recorder() != NULL, "this assembler needs an OopRecorder"); 4701 int klass_index = oop_recorder()->find_index(k); 4702 RelocationHolder rspec = metadata_Relocation::spec(klass_index); 4703 Assembler::cmp_narrow_oop(dst, CompressedKlassPointers::encode(k), rspec); 4704 } 4705 4706 void MacroAssembler::reinit_heapbase() { 4707 if (UseCompressedOops) { 4708 if (Universe::heap() != NULL) { 4709 if (CompressedOops::base() == NULL) { 4710 MacroAssembler::xorptr(r12_heapbase, r12_heapbase); 4711 } else { 4712 mov64(r12_heapbase, (int64_t)CompressedOops::ptrs_base()); 4713 } 4714 } else { 4715 movptr(r12_heapbase, ExternalAddress((address)CompressedOops::ptrs_base_addr())); 4716 } 4717 } 4718 } 4719 4720 #endif // _LP64 4721 4722 // C2 compiled method's prolog code. 4723 void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b, bool is_stub) { 4724 4725 // WARNING: Initial instruction MUST be 5 bytes or longer so that 4726 // NativeJump::patch_verified_entry will be able to patch out the entry 4727 // code safely. The push to verify stack depth is ok at 5 bytes, 4728 // the frame allocation can be either 3 or 6 bytes. So if we don't do 4729 // stack bang then we must use the 6 byte frame allocation even if 4730 // we have no frame. :-( 4731 assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect"); 4732 4733 assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); 4734 // Remove word for return addr 4735 framesize -= wordSize; 4736 stack_bang_size -= wordSize; 4737 4738 // Calls to C2R adapters often do not accept exceptional returns. 4739 // We require that their callers must bang for them. But be careful, because 4740 // some VM calls (such as call site linkage) can use several kilobytes of 4741 // stack. But the stack safety zone should account for that. 4742 // See bugs 4446381, 4468289, 4497237. 
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp so that on return to the interpreter rbp will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      if (framesize > 0) {
        addptr(rbp, framesize);
      }
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

  if (!is_stub) {
    BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->nmethod_entry_barrier(this);
  }
}

// clear memory of size 'cnt' qwords, starting at 'base' using XMM/YMM registers
void MacroAssembler::xmm_clear_mem(Register base, Register cnt, XMMRegister xtmp) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  Label L_zero_64_bytes, L_loop, L_sloop, L_tail, L_end;
  if (UseAVX >= 2) {
    vpxor(xtmp, xtmp, xtmp, AVX_256bit);
  } else {
    pxor(xtmp, xtmp);
  }
  jmp(L_zero_64_bytes);

  BIND(L_loop);
  if (UseAVX >= 2) {
    vmovdqu(Address(base,  0), xtmp);
    vmovdqu(Address(base, 32), xtmp);
  } else {
    movdqu(Address(base,  0), xtmp);
    movdqu(Address(base, 16), xtmp);
    movdqu(Address(base, 32), xtmp);
    movdqu(Address(base, 48), xtmp);
  }
  addptr(base, 64);

  BIND(L_zero_64_bytes);
  subptr(cnt, 8);
  jccb(Assembler::greaterEqual, L_loop);
  addptr(cnt, 4);
  jccb(Assembler::less, L_tail);
  // Copy trailing 32 bytes
  if (UseAVX >= 2) {
    vmovdqu(Address(base, 0), xtmp);
  } else {
    movdqu(Address(base,  0), xtmp);
    movdqu(Address(base, 16), xtmp);
  }
  addptr(base, 32);
  subptr(cnt, 4);

  BIND(L_tail);
  addptr(cnt, 4);
  jccb(Assembler::lessEqual, L_end);
  decrement(cnt);

  BIND(L_sloop);
  movq(Address(base, 0), xtmp);
  addptr(base, 8);
  decrement(cnt);
  jccb(Assembler::greaterEqual, L_sloop);
  BIND(L_end);
}
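
// In effect, xmm_clear_mem performs (illustrative pseudocode):
//
//   while (cnt >= 8) { memset(base, 0, 64); base += 64; cnt -= 8; }  // 64-byte chunks
//   if    (cnt >= 4) { memset(base, 0, 32); base += 32; cnt -= 4; }  // one 32-byte chunk
//   while (cnt >  0) { *(uint64_t*)base = 0; base += 8; cnt -= 1; }  // qword tail
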
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp, XMMRegister xtmp, bool is_large) {
  // cnt      - number of qwords (8-byte words).
  // base     - start address, qword aligned.
  // is_large - if optimizers know cnt is larger than InitArrayShortSize
  assert(base == rdi, "base register must be edi for rep stos");
  assert(tmp == rax, "tmp register must be eax for rep stos");
  assert(cnt == rcx, "cnt register must be ecx for rep stos");
  assert(InitArrayShortSize % BytesPerLong == 0,
         "InitArrayShortSize should be a multiple of BytesPerLong");

  Label DONE;

  if (!is_large || !UseXMMForObjInit) {
    xorptr(tmp, tmp);
  }

  if (!is_large) {
    Label LOOP, LONG;
    cmpptr(cnt, InitArrayShortSize/BytesPerLong);
    jccb(Assembler::greater, LONG);

    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM

    decrement(cnt);
    jccb(Assembler::negative, DONE); // Zero length

    // Use individual pointer-sized stores for small counts:
    BIND(LOOP);
    movptr(Address(base, cnt, Address::times_ptr), tmp);
    decrement(cnt);
    jccb(Assembler::greaterEqual, LOOP);
    jmpb(DONE);

    BIND(LONG);
  }

  // Use longer rep-prefixed ops for non-small counts:
  if (UseFastStosb) {
    shlptr(cnt, 3); // convert to number of bytes
    rep_stosb();
  } else if (UseXMMForObjInit) {
    movptr(tmp, base);
    xmm_clear_mem(tmp, cnt, xtmp);
  } else {
    NOT_LP64(shlptr(cnt, 1);) // convert to number of 32-bit words for 32-bit VM
    rep_stos();
  }

  BIND(DONE);
}

void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit;
  Label L_fill_2_bytes, L_fill_4_bytes;

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    Label L_skip_align2;
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      Label L_skip_align1;
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

addptr(to, 32); 4988 subl(count, 8 << shift); 4989 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 4990 BIND(L_check_fill_8_bytes); 4991 addl(count, 8 << shift); 4992 jccb(Assembler::zero, L_exit); 4993 jmpb(L_fill_8_bytes); 4994 4995 // 4996 // length is too short, just fill qwords 4997 // 4998 BIND(L_fill_8_bytes_loop); 4999 movl(Address(to, 0), value); 5000 movl(Address(to, 4), value); 5001 addptr(to, 8); 5002 BIND(L_fill_8_bytes); 5003 subl(count, 1 << (shift + 1)); 5004 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 5005 // fall through to fill 4 bytes 5006 } else { 5007 Label L_fill_32_bytes; 5008 if (!UseUnalignedLoadStores) { 5009 // align to 8 bytes, we know we are 4 byte aligned to start 5010 testptr(to, 4); 5011 jccb(Assembler::zero, L_fill_32_bytes); 5012 movl(Address(to, 0), value); 5013 addptr(to, 4); 5014 subl(count, 1<<shift); 5015 } 5016 BIND(L_fill_32_bytes); 5017 { 5018 assert( UseSSE >= 2, "supported cpu only" ); 5019 Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes; 5020 movdl(xtmp, value); 5021 if (UseAVX >= 2 && UseUnalignedLoadStores) { 5022 Label L_check_fill_32_bytes; 5023 if (UseAVX > 2) { 5024 // Fill 64-byte chunks 5025 Label L_fill_64_bytes_loop_avx3, L_check_fill_64_bytes_avx2; 5026 5027 // If number of bytes to fill < AVX3Threshold, perform fill using AVX2 5028 cmpl(count, AVX3Threshold); 5029 jccb(Assembler::below, L_check_fill_64_bytes_avx2); 5030 5031 vpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit); 5032 5033 subl(count, 16 << shift); 5034 jccb(Assembler::less, L_check_fill_32_bytes); 5035 align(16); 5036 5037 BIND(L_fill_64_bytes_loop_avx3); 5038 evmovdqul(Address(to, 0), xtmp, Assembler::AVX_512bit); 5039 addptr(to, 64); 5040 subl(count, 16 << shift); 5041 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop_avx3); 5042 jmpb(L_check_fill_32_bytes); 5043 5044 BIND(L_check_fill_64_bytes_avx2); 5045 } 5046 // Fill 64-byte chunks 5047 Label L_fill_64_bytes_loop; 5048 vpbroadcastd(xtmp, xtmp, Assembler::AVX_256bit); 5049 5050 subl(count, 16 << shift); 5051 jcc(Assembler::less, L_check_fill_32_bytes); 5052 align(16); 5053 5054 BIND(L_fill_64_bytes_loop); 5055 vmovdqu(Address(to, 0), xtmp); 5056 vmovdqu(Address(to, 32), xtmp); 5057 addptr(to, 64); 5058 subl(count, 16 << shift); 5059 jcc(Assembler::greaterEqual, L_fill_64_bytes_loop); 5060 5061 BIND(L_check_fill_32_bytes); 5062 addl(count, 8 << shift); 5063 jccb(Assembler::less, L_check_fill_8_bytes); 5064 vmovdqu(Address(to, 0), xtmp); 5065 addptr(to, 32); 5066 subl(count, 8 << shift); 5067 5068 BIND(L_check_fill_8_bytes); 5069 // clean upper bits of YMM registers 5070 movdl(xtmp, value); 5071 pshufd(xtmp, xtmp, 0); 5072 } else { 5073 // Fill 32-byte chunks 5074 pshufd(xtmp, xtmp, 0); 5075 5076 subl(count, 8 << shift); 5077 jcc(Assembler::less, L_check_fill_8_bytes); 5078 align(16); 5079 5080 BIND(L_fill_32_bytes_loop); 5081 5082 if (UseUnalignedLoadStores) { 5083 movdqu(Address(to, 0), xtmp); 5084 movdqu(Address(to, 16), xtmp); 5085 } else { 5086 movq(Address(to, 0), xtmp); 5087 movq(Address(to, 8), xtmp); 5088 movq(Address(to, 16), xtmp); 5089 movq(Address(to, 24), xtmp); 5090 } 5091 5092 addptr(to, 32); 5093 subl(count, 8 << shift); 5094 jcc(Assembler::greaterEqual, L_fill_32_bytes_loop); 5095 5096 BIND(L_check_fill_8_bytes); 5097 } 5098 addl(count, 8 << shift); 5099 jccb(Assembler::zero, L_exit); 5100 jmpb(L_fill_8_bytes); 5101 5102 // 5103 // length is too short, just fill qwords 5104 // 5105 BIND(L_fill_8_bytes_loop); 5106 movq(Address(to, 0), xtmp); 5107 
addptr(to, 8); 5108 BIND(L_fill_8_bytes); 5109 subl(count, 1 << (shift + 1)); 5110 jcc(Assembler::greaterEqual, L_fill_8_bytes_loop); 5111 } 5112 } 5113 // fill trailing 4 bytes 5114 BIND(L_fill_4_bytes); 5115 testl(count, 1<<shift); 5116 jccb(Assembler::zero, L_fill_2_bytes); 5117 movl(Address(to, 0), value); 5118 if (t == T_BYTE || t == T_SHORT) { 5119 Label L_fill_byte; 5120 addptr(to, 4); 5121 BIND(L_fill_2_bytes); 5122 // fill trailing 2 bytes 5123 testl(count, 1<<(shift-1)); 5124 jccb(Assembler::zero, L_fill_byte); 5125 movw(Address(to, 0), value); 5126 if (t == T_BYTE) { 5127 addptr(to, 2); 5128 BIND(L_fill_byte); 5129 // fill trailing byte 5130 testl(count, 1); 5131 jccb(Assembler::zero, L_exit); 5132 movb(Address(to, 0), value); 5133 } else { 5134 BIND(L_fill_byte); 5135 } 5136 } else { 5137 BIND(L_fill_2_bytes); 5138 } 5139 BIND(L_exit); 5140 } 5141 5142 // encode char[] to byte[] in ISO_8859_1 5143 //@HotSpotIntrinsicCandidate 5144 //private static int implEncodeISOArray(byte[] sa, int sp, 5145 //byte[] da, int dp, int len) { 5146 // int i = 0; 5147 // for (; i < len; i++) { 5148 // char c = StringUTF16.getChar(sa, sp++); 5149 // if (c > '\u00FF') 5150 // break; 5151 // da[dp++] = (byte)c; 5152 // } 5153 // return i; 5154 //} 5155 void MacroAssembler::encode_iso_array(Register src, Register dst, Register len, 5156 XMMRegister tmp1Reg, XMMRegister tmp2Reg, 5157 XMMRegister tmp3Reg, XMMRegister tmp4Reg, 5158 Register tmp5, Register result) { 5159 5160 // rsi: src 5161 // rdi: dst 5162 // rdx: len 5163 // rcx: tmp5 5164 // rax: result 5165 ShortBranchVerifier sbv(this); 5166 assert_different_registers(src, dst, len, tmp5, result); 5167 Label L_done, L_copy_1_char, L_copy_1_char_exit; 5168 5169 // set result 5170 xorl(result, result); 5171 // check for zero length 5172 testl(len, len); 5173 jcc(Assembler::zero, L_done); 5174 5175 movl(result, len); 5176 5177 // Setup pointers 5178 lea(src, Address(src, len, Address::times_2)); // char[] 5179 lea(dst, Address(dst, len, Address::times_1)); // byte[] 5180 negptr(len); 5181 5182 if (UseSSE42Intrinsics || UseAVX >= 2) { 5183 Label L_copy_8_chars, L_copy_8_chars_exit; 5184 Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit; 5185 5186 if (UseAVX >= 2) { 5187 Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit; 5188 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector 5189 movdl(tmp1Reg, tmp5); 5190 vpbroadcastd(tmp1Reg, tmp1Reg, Assembler::AVX_256bit); 5191 jmp(L_chars_32_check); 5192 5193 bind(L_copy_32_chars); 5194 vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64)); 5195 vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32)); 5196 vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 5197 vptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector 5198 jccb(Assembler::notZero, L_copy_32_chars_exit); 5199 vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1); 5200 vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1); 5201 vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg); 5202 5203 bind(L_chars_32_check); 5204 addptr(len, 32); 5205 jcc(Assembler::lessEqual, L_copy_32_chars); 5206 5207 bind(L_copy_32_chars_exit); 5208 subptr(len, 16); 5209 jccb(Assembler::greater, L_copy_16_chars_exit); 5210 5211 } else if (UseSSE42Intrinsics) { 5212 movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vector 5213 movdl(tmp1Reg, tmp5); 5214 pshufd(tmp1Reg, tmp1Reg, 0); 5215 jmpb(L_chars_16_check); 5216 } 5217 5218 bind(L_copy_16_chars); 5219 if (UseAVX >= 2) { 5220 
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jcc(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jcc(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    subptr(len, 8);
    jccb(Assembler::zero, L_done);
  }

  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, 0xff00); // check if Unicode char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is the negative count of unprocessed elements

  bind(L_done);
}

#ifdef _LP64
/**
 * Helper for multiply_to_len().
 */
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);
  addq(dest_lo, src2);
  adcq(dest_hi, 0);
}
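// What add2_with_carry computes, as a rough C-style sketch (illustration
// only; "unsigned __int128" here is an assumed stand-in for the 128-bit
// accumulator, not a type used by this file):
//
//   unsigned __int128 acc = ((unsigned __int128)dest_hi << 64) | dest_lo;
//   acc += src1;
//   acc += src2;
//   dest_lo = (uint64_t)acc;          // low 64 bits
//   dest_hi = (uint64_t)(acc >> 64);  // high 64 bits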
/**
 * Multiply 64 bit by 64 bit first loop.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);

  movq(x_xstart, Address(x, xstart, Address::times_4, 0));
  rorq(x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit);
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);
  movq(y_idx, Address(y, idx, Address::times_4, 0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);
  subl(kdx, 2);
  movl(Address(z, kdx, Address::times_4, 4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4, 0), product);
  movq(carry, rdx);
  jmp(L_first_loop);

  bind(L_one_y);
  movl(y_idx, Address(y, 0));
  jmp(L_multiply);

  bind(L_one_x);
  movl(x_xstart, Address(x, 0));
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 64 bit by 64 bit and add 128 bit.
 */
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                                            Register yz_idx, Register idx,
                                            Register carry, Register product, int offset) {
  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //     z[kdx] = (jlong)product;

  movq(yz_idx, Address(y, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian
  movq(product, x_xstart);
  mulq(yz_idx);     // product(rax) * yz_idx -> rdx:product(rax)
  movq(yz_idx, Address(z, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian

  add2_with_carry(rdx, product, carry, yz_idx);

  movl(Address(z, idx, Address::times_4, offset+4), product);
  shrq(product, 32);
  movl(Address(z, idx, Address::times_4, offset), product);
}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
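 *
 * A worked example of the trip-count math below (hedged reading aid only):
 * andl(jdx, 0xFFFFFFFC) followed by shrl(jdx, 2) leaves jdx == idx / 4
 * rounded down, so e.g. idx == 7 gives jdx == 1 (one unrolled pass), and
 * the remaining idx & 0x3 == 3 ints are handled by the tail code after
 * L_third_loop_exit.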
5379 */ 5380 void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z, 5381 Register yz_idx, Register idx, Register jdx, 5382 Register carry, Register product, 5383 Register carry2) { 5384 // jlong carry, x[], y[], z[]; 5385 // int kdx = ystart+1; 5386 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 5387 // huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry; 5388 // z[kdx+idx+1] = (jlong)product; 5389 // jlong carry2 = (jlong)(product >>> 64); 5390 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry2; 5391 // z[kdx+idx] = (jlong)product; 5392 // carry = (jlong)(product >>> 64); 5393 // } 5394 // idx += 2; 5395 // if (idx > 0) { 5396 // product = (y[idx] * x_xstart) + z[kdx+idx] + carry; 5397 // z[kdx+idx] = (jlong)product; 5398 // carry = (jlong)(product >>> 64); 5399 // } 5400 // 5401 5402 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 5403 5404 movl(jdx, idx); 5405 andl(jdx, 0xFFFFFFFC); 5406 shrl(jdx, 2); 5407 5408 bind(L_third_loop); 5409 subl(jdx, 1); 5410 jcc(Assembler::negative, L_third_loop_exit); 5411 subl(idx, 4); 5412 5413 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8); 5414 movq(carry2, rdx); 5415 5416 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0); 5417 movq(carry, rdx); 5418 jmp(L_third_loop); 5419 5420 bind (L_third_loop_exit); 5421 5422 andl (idx, 0x3); 5423 jcc(Assembler::zero, L_post_third_loop_done); 5424 5425 Label L_check_1; 5426 subl(idx, 2); 5427 jcc(Assembler::negative, L_check_1); 5428 5429 multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0); 5430 movq(carry, rdx); 5431 5432 bind (L_check_1); 5433 addl (idx, 0x2); 5434 andl (idx, 0x1); 5435 subl(idx, 1); 5436 jcc(Assembler::negative, L_post_third_loop_done); 5437 5438 movl(yz_idx, Address(y, idx, Address::times_4, 0)); 5439 movq(product, x_xstart); 5440 mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax) 5441 movl(yz_idx, Address(z, idx, Address::times_4, 0)); 5442 5443 add2_with_carry(rdx, product, yz_idx, carry); 5444 5445 movl(Address(z, idx, Address::times_4, 0), product); 5446 shrq(product, 32); 5447 5448 shlq(rdx, 32); 5449 orq(product, rdx); 5450 movq(carry, product); 5451 5452 bind(L_post_third_loop_done); 5453 } 5454 5455 /** 5456 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop. 
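 *
 * Hedged note (reading aid only): mulxq takes its implicit multiplicand
 * from rdx, so callers load x[xstart] into rdx before entering this loop
 * (see multiply_to_len below). On CPUs with ADX the body keeps two
 * independent carry chains live, adcxq consuming only CF and adoxq
 * consuming only OF, so the two 64x64 partial products can be accumulated
 * without serializing on a single carry flag.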
5457 * 5458 */ 5459 void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z, 5460 Register carry, Register carry2, 5461 Register idx, Register jdx, 5462 Register yz_idx1, Register yz_idx2, 5463 Register tmp, Register tmp3, Register tmp4) { 5464 assert(UseBMI2Instructions, "should be used only when BMI2 is available"); 5465 5466 // jlong carry, x[], y[], z[]; 5467 // int kdx = ystart+1; 5468 // for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop 5469 // huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry; 5470 // jlong carry2 = (jlong)(tmp3 >>> 64); 5471 // huge_128 tmp4 = (y[idx] * rdx) + z[kdx+idx] + carry2; 5472 // carry = (jlong)(tmp4 >>> 64); 5473 // z[kdx+idx+1] = (jlong)tmp3; 5474 // z[kdx+idx] = (jlong)tmp4; 5475 // } 5476 // idx += 2; 5477 // if (idx > 0) { 5478 // yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry; 5479 // z[kdx+idx] = (jlong)yz_idx1; 5480 // carry = (jlong)(yz_idx1 >>> 64); 5481 // } 5482 // 5483 5484 Label L_third_loop, L_third_loop_exit, L_post_third_loop_done; 5485 5486 movl(jdx, idx); 5487 andl(jdx, 0xFFFFFFFC); 5488 shrl(jdx, 2); 5489 5490 bind(L_third_loop); 5491 subl(jdx, 1); 5492 jcc(Assembler::negative, L_third_loop_exit); 5493 subl(idx, 4); 5494 5495 movq(yz_idx1, Address(y, idx, Address::times_4, 8)); 5496 rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian 5497 movq(yz_idx2, Address(y, idx, Address::times_4, 0)); 5498 rorxq(yz_idx2, yz_idx2, 32); 5499 5500 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 5501 mulxq(carry2, tmp, yz_idx2); // yz_idx2 * rdx -> carry2:tmp 5502 5503 movq(yz_idx1, Address(z, idx, Address::times_4, 8)); 5504 rorxq(yz_idx1, yz_idx1, 32); 5505 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 5506 rorxq(yz_idx2, yz_idx2, 32); 5507 5508 if (VM_Version::supports_adx()) { 5509 adcxq(tmp3, carry); 5510 adoxq(tmp3, yz_idx1); 5511 5512 adcxq(tmp4, tmp); 5513 adoxq(tmp4, yz_idx2); 5514 5515 movl(carry, 0); // does not affect flags 5516 adcxq(carry2, carry); 5517 adoxq(carry2, carry); 5518 } else { 5519 add2_with_carry(tmp4, tmp3, carry, yz_idx1); 5520 add2_with_carry(carry2, tmp4, tmp, yz_idx2); 5521 } 5522 movq(carry, carry2); 5523 5524 movl(Address(z, idx, Address::times_4, 12), tmp3); 5525 shrq(tmp3, 32); 5526 movl(Address(z, idx, Address::times_4, 8), tmp3); 5527 5528 movl(Address(z, idx, Address::times_4, 4), tmp4); 5529 shrq(tmp4, 32); 5530 movl(Address(z, idx, Address::times_4, 0), tmp4); 5531 5532 jmp(L_third_loop); 5533 5534 bind (L_third_loop_exit); 5535 5536 andl (idx, 0x3); 5537 jcc(Assembler::zero, L_post_third_loop_done); 5538 5539 Label L_check_1; 5540 subl(idx, 2); 5541 jcc(Assembler::negative, L_check_1); 5542 5543 movq(yz_idx1, Address(y, idx, Address::times_4, 0)); 5544 rorxq(yz_idx1, yz_idx1, 32); 5545 mulxq(tmp4, tmp3, yz_idx1); // yz_idx1 * rdx -> tmp4:tmp3 5546 movq(yz_idx2, Address(z, idx, Address::times_4, 0)); 5547 rorxq(yz_idx2, yz_idx2, 32); 5548 5549 add2_with_carry(tmp4, tmp3, carry, yz_idx2); 5550 5551 movl(Address(z, idx, Address::times_4, 4), tmp3); 5552 shrq(tmp3, 32); 5553 movl(Address(z, idx, Address::times_4, 0), tmp3); 5554 movq(carry, tmp4); 5555 5556 bind (L_check_1); 5557 addl (idx, 0x2); 5558 andl (idx, 0x1); 5559 subl(idx, 1); 5560 jcc(Assembler::negative, L_post_third_loop_done); 5561 movl(tmp4, Address(y, idx, Address::times_4, 0)); 5562 mulxq(carry2, tmp3, tmp4); // tmp4 * rdx -> carry2:tmp3 5563 movl(tmp4, Address(z, idx, Address::times_4, 0)); 5564 5565 add2_with_carry(carry2, tmp3, tmp4, carry); 5566 5567 movl(Address(z, idx, 
Address::times_4, 0), tmp3);
  shrq(tmp3, 32);

  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
 *
 * rdi: x
 * rax: xlen
 * rsi: y
 * rcx: ylen
 * r8:  z
 * r11: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen,
                                     Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx);

  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  push(xlen);
  push(zlen);

  const Register idx = tmp1;
  const Register kdx = tmp2;
  const Register xstart = tmp3;

  const Register y_idx = tmp4;
  const Register carry = tmp5;
  const Register product  = xlen;
  const Register x_xstart = zlen; // reuse register

  // First Loop.
  //
  //  final static long LONG_MASK = 0xffffffffL;
  //  int xstart = xlen - 1;
  //  int ystart = ylen - 1;
  //  long carry = 0;
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry;
  //    z[kdx] = (int)product;
  //    carry = product >>> 32;
  //  }
  //  z[xstart] = (int)carry;
  //

  movl(idx, ylen);      // idx = ylen;
  movl(kdx, zlen);      // kdx = xlen+ylen;
  xorq(carry, carry);   // carry = 0;

  Label L_done;

  movl(xstart, xlen);
  decrementl(xstart);
  jcc(Assembler::negative, L_done);

  multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx);

  Label L_second_loop;
  testl(kdx, kdx);
  jcc(Assembler::zero, L_second_loop);

  Label L_carry;
  subl(kdx, 1);
  jcc(Assembler::zero, L_carry);

  movl(Address(z, kdx, Address::times_4, 0), carry);
  shrq(carry, 32);
  subl(kdx, 1);

  bind(L_carry);
  movl(Address(z, kdx, Address::times_4, 0), carry);

  // Second and third (nested) loops.
5658 // 5659 // for (int i = xstart-1; i >= 0; i--) { // Second loop 5660 // carry = 0; 5661 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 5662 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 5663 // (z[k] & LONG_MASK) + carry; 5664 // z[k] = (int)product; 5665 // carry = product >>> 32; 5666 // } 5667 // z[i] = (int)carry; 5668 // } 5669 // 5670 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 5671 5672 const Register jdx = tmp1; 5673 5674 bind(L_second_loop); 5675 xorl(carry, carry); // carry = 0; 5676 movl(jdx, ylen); // j = ystart+1 5677 5678 subl(xstart, 1); // i = xstart-1; 5679 jcc(Assembler::negative, L_done); 5680 5681 push (z); 5682 5683 Label L_last_x; 5684 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 5685 subl(xstart, 1); // i = xstart-1; 5686 jcc(Assembler::negative, L_last_x); 5687 5688 if (UseBMI2Instructions) { 5689 movq(rdx, Address(x, xstart, Address::times_4, 0)); 5690 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 5691 } else { 5692 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 5693 rorq(x_xstart, 32); // convert big-endian to little-endian 5694 } 5695 5696 Label L_third_loop_prologue; 5697 bind(L_third_loop_prologue); 5698 5699 push (x); 5700 push (xstart); 5701 push (ylen); 5702 5703 5704 if (UseBMI2Instructions) { 5705 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 5706 } else { // !UseBMI2Instructions 5707 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 5708 } 5709 5710 pop(ylen); 5711 pop(xlen); 5712 pop(x); 5713 pop(z); 5714 5715 movl(tmp3, xlen); 5716 addl(tmp3, 1); 5717 movl(Address(z, tmp3, Address::times_4, 0), carry); 5718 subl(tmp3, 1); 5719 jccb(Assembler::negative, L_done); 5720 5721 shrq(carry, 32); 5722 movl(Address(z, tmp3, Address::times_4, 0), carry); 5723 jmp(L_second_loop); 5724 5725 // Next infrequent code is moved outside loops. 5726 bind(L_last_x); 5727 if (UseBMI2Instructions) { 5728 movl(rdx, Address(x, 0)); 5729 } else { 5730 movl(x_xstart, Address(x, 0)); 5731 } 5732 jmp(L_third_loop_prologue); 5733 5734 bind(L_done); 5735 5736 pop(zlen); 5737 pop(xlen); 5738 5739 pop(tmp5); 5740 pop(tmp4); 5741 pop(tmp3); 5742 pop(tmp2); 5743 pop(tmp1); 5744 } 5745 5746 void MacroAssembler::vectorized_mismatch(Register obja, Register objb, Register length, Register log2_array_indxscale, 5747 Register result, Register tmp1, Register tmp2, XMMRegister rymm0, XMMRegister rymm1, XMMRegister rymm2){ 5748 assert(UseSSE42Intrinsics, "SSE4.2 must be enabled."); 5749 Label VECTOR16_LOOP, VECTOR8_LOOP, VECTOR4_LOOP; 5750 Label VECTOR8_TAIL, VECTOR4_TAIL; 5751 Label VECTOR32_NOT_EQUAL, VECTOR16_NOT_EQUAL, VECTOR8_NOT_EQUAL, VECTOR4_NOT_EQUAL; 5752 Label SAME_TILL_END, DONE; 5753 Label BYTES_LOOP, BYTES_TAIL, BYTES_NOT_EQUAL; 5754 5755 //scale is in rcx in both Win64 and Unix 5756 ShortBranchVerifier sbv(this); 5757 5758 shlq(length); 5759 xorq(result, result); 5760 5761 if ((AVX3Threshold == 0) && (UseAVX > 2) && 5762 VM_Version::supports_avx512vlbw()) { 5763 Label VECTOR64_LOOP, VECTOR64_NOT_EQUAL, VECTOR32_TAIL; 5764 5765 cmpq(length, 64); 5766 jcc(Assembler::less, VECTOR32_TAIL); 5767 5768 movq(tmp1, length); 5769 andq(tmp1, 0x3F); // tail count 5770 andq(length, ~(0x3F)); //vector count 5771 5772 bind(VECTOR64_LOOP); 5773 // AVX512 code to compare 64 byte vectors. 
    evmovdqub(rymm0, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, rymm0, Address(objb, result), Assembler::AVX_512bit);
    kortestql(k7, k7);
    jcc(Assembler::aboveEqual, VECTOR64_NOT_EQUAL);     // mismatch
    addq(result, 64);
    subq(length, 64);
    jccb(Assembler::notZero, VECTOR64_LOOP);

    //bind(VECTOR64_TAIL);
    testq(tmp1, tmp1);
    jcc(Assembler::zero, SAME_TILL_END);

    // AVX512 code to compare up to 63 byte vectors.
    mov64(tmp2, 0xFFFFFFFFFFFFFFFF);
    shlxq(tmp2, tmp2, tmp1);
    notq(tmp2);
    kmovql(k3, tmp2);

    evmovdqub(rymm0, k3, Address(obja, result), Assembler::AVX_512bit);
    evpcmpeqb(k7, k3, rymm0, Address(objb, result), Assembler::AVX_512bit);

    ktestql(k7, k3);
    jcc(Assembler::below, SAME_TILL_END);     // not mismatch

    bind(VECTOR64_NOT_EQUAL);
    kmovql(tmp1, k7);
    notq(tmp1);
    tzcntq(tmp1, tmp1);
    addq(result, tmp1);
    shrq(result);
    jmp(DONE);
    bind(VECTOR32_TAIL);
  }

  cmpq(length, 8);
  jcc(Assembler::equal, VECTOR8_LOOP);
  jcc(Assembler::less, VECTOR4_TAIL);

  if (UseAVX >= 2) {
    Label VECTOR16_TAIL, VECTOR32_LOOP;

    cmpq(length, 16);
    jcc(Assembler::equal, VECTOR16_LOOP);
    jcc(Assembler::less, VECTOR8_LOOP);

    cmpq(length, 32);
    jccb(Assembler::less, VECTOR16_TAIL);

    subq(length, 32);
    bind(VECTOR32_LOOP);
    vmovdqu(rymm0, Address(obja, result));
    vmovdqu(rymm1, Address(objb, result));
    vpxor(rymm2, rymm0, rymm1, Assembler::AVX_256bit);
    vptest(rymm2, rymm2);
    jcc(Assembler::notZero, VECTOR32_NOT_EQUAL); // mismatch found
    addq(result, 32);
    subq(length, 32);
    jcc(Assembler::greaterEqual, VECTOR32_LOOP);
    addq(length, 32);
    jcc(Assembler::equal, SAME_TILL_END);
    // falling through if less than 32 bytes left; close the branch here.
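    // Hedged reading aid (no new behavior): from here down the mismatch scan
    // degrades through progressively smaller probes, 16-byte SSE/AVX compares,
    // then 8- and 4-byte GPR xors, then single bytes; each *_NOT_EQUAL handler
    // locates the first differing bit/byte and converts that position back to
    // an element index with shrq(result), a shift by cl (log2_array_indxscale
    // is in rcx).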
5836 5837 bind(VECTOR16_TAIL); 5838 cmpq(length, 16); 5839 jccb(Assembler::less, VECTOR8_TAIL); 5840 bind(VECTOR16_LOOP); 5841 movdqu(rymm0, Address(obja, result)); 5842 movdqu(rymm1, Address(objb, result)); 5843 vpxor(rymm2, rymm0, rymm1, Assembler::AVX_128bit); 5844 ptest(rymm2, rymm2); 5845 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 5846 addq(result, 16); 5847 subq(length, 16); 5848 jcc(Assembler::equal, SAME_TILL_END); 5849 //falling through if less than 16 bytes left 5850 } else {//regular intrinsics 5851 5852 cmpq(length, 16); 5853 jccb(Assembler::less, VECTOR8_TAIL); 5854 5855 subq(length, 16); 5856 bind(VECTOR16_LOOP); 5857 movdqu(rymm0, Address(obja, result)); 5858 movdqu(rymm1, Address(objb, result)); 5859 pxor(rymm0, rymm1); 5860 ptest(rymm0, rymm0); 5861 jcc(Assembler::notZero, VECTOR16_NOT_EQUAL);//mismatch found 5862 addq(result, 16); 5863 subq(length, 16); 5864 jccb(Assembler::greaterEqual, VECTOR16_LOOP); 5865 addq(length, 16); 5866 jcc(Assembler::equal, SAME_TILL_END); 5867 //falling through if less than 16 bytes left 5868 } 5869 5870 bind(VECTOR8_TAIL); 5871 cmpq(length, 8); 5872 jccb(Assembler::less, VECTOR4_TAIL); 5873 bind(VECTOR8_LOOP); 5874 movq(tmp1, Address(obja, result)); 5875 movq(tmp2, Address(objb, result)); 5876 xorq(tmp1, tmp2); 5877 testq(tmp1, tmp1); 5878 jcc(Assembler::notZero, VECTOR8_NOT_EQUAL);//mismatch found 5879 addq(result, 8); 5880 subq(length, 8); 5881 jcc(Assembler::equal, SAME_TILL_END); 5882 //falling through if less than 8 bytes left 5883 5884 bind(VECTOR4_TAIL); 5885 cmpq(length, 4); 5886 jccb(Assembler::less, BYTES_TAIL); 5887 bind(VECTOR4_LOOP); 5888 movl(tmp1, Address(obja, result)); 5889 xorl(tmp1, Address(objb, result)); 5890 testl(tmp1, tmp1); 5891 jcc(Assembler::notZero, VECTOR4_NOT_EQUAL);//mismatch found 5892 addq(result, 4); 5893 subq(length, 4); 5894 jcc(Assembler::equal, SAME_TILL_END); 5895 //falling through if less than 4 bytes left 5896 5897 bind(BYTES_TAIL); 5898 bind(BYTES_LOOP); 5899 load_unsigned_byte(tmp1, Address(obja, result)); 5900 load_unsigned_byte(tmp2, Address(objb, result)); 5901 xorl(tmp1, tmp2); 5902 testl(tmp1, tmp1); 5903 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 5904 decq(length); 5905 jcc(Assembler::zero, SAME_TILL_END); 5906 incq(result); 5907 load_unsigned_byte(tmp1, Address(obja, result)); 5908 load_unsigned_byte(tmp2, Address(objb, result)); 5909 xorl(tmp1, tmp2); 5910 testl(tmp1, tmp1); 5911 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 5912 decq(length); 5913 jcc(Assembler::zero, SAME_TILL_END); 5914 incq(result); 5915 load_unsigned_byte(tmp1, Address(obja, result)); 5916 load_unsigned_byte(tmp2, Address(objb, result)); 5917 xorl(tmp1, tmp2); 5918 testl(tmp1, tmp1); 5919 jcc(Assembler::notZero, BYTES_NOT_EQUAL);//mismatch found 5920 jmp(SAME_TILL_END); 5921 5922 if (UseAVX >= 2) { 5923 bind(VECTOR32_NOT_EQUAL); 5924 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_256bit); 5925 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_256bit); 5926 vpxor(rymm0, rymm0, rymm2, Assembler::AVX_256bit); 5927 vpmovmskb(tmp1, rymm0); 5928 bsfq(tmp1, tmp1); 5929 addq(result, tmp1); 5930 shrq(result); 5931 jmp(DONE); 5932 } 5933 5934 bind(VECTOR16_NOT_EQUAL); 5935 if (UseAVX >= 2) { 5936 vpcmpeqb(rymm2, rymm2, rymm2, Assembler::AVX_128bit); 5937 vpcmpeqb(rymm0, rymm0, rymm1, Assembler::AVX_128bit); 5938 pxor(rymm0, rymm2); 5939 } else { 5940 pcmpeqb(rymm2, rymm2); 5941 pxor(rymm0, rymm1); 5942 pcmpeqb(rymm0, rymm1); 5943 pxor(rymm0, rymm2); 5944 } 5945 pmovmskb(tmp1, rymm0); 5946 
bsfq(tmp1, tmp1); 5947 addq(result, tmp1); 5948 shrq(result); 5949 jmpb(DONE); 5950 5951 bind(VECTOR8_NOT_EQUAL); 5952 bind(VECTOR4_NOT_EQUAL); 5953 bsfq(tmp1, tmp1); 5954 shrq(tmp1, 3); 5955 addq(result, tmp1); 5956 bind(BYTES_NOT_EQUAL); 5957 shrq(result); 5958 jmpb(DONE); 5959 5960 bind(SAME_TILL_END); 5961 mov64(result, -1); 5962 5963 bind(DONE); 5964 } 5965 5966 //Helper functions for square_to_len() 5967 5968 /** 5969 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 5970 * Preserves x and z and modifies rest of the registers. 5971 */ 5972 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 5973 // Perform square and right shift by 1 5974 // Handle odd xlen case first, then for even xlen do the following 5975 // jlong carry = 0; 5976 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 5977 // huge_128 product = x[j:j+1] * x[j:j+1]; 5978 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 5979 // z[i+2:i+3] = (jlong)(product >>> 1); 5980 // carry = (jlong)product; 5981 // } 5982 5983 xorq(tmp5, tmp5); // carry 5984 xorq(rdxReg, rdxReg); 5985 xorl(tmp1, tmp1); // index for x 5986 xorl(tmp4, tmp4); // index for z 5987 5988 Label L_first_loop, L_first_loop_exit; 5989 5990 testl(xlen, 1); 5991 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 5992 5993 // Square and right shift by 1 the odd element using 32 bit multiply 5994 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 5995 imulq(raxReg, raxReg); 5996 shrq(raxReg, 1); 5997 adcq(tmp5, 0); 5998 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 5999 incrementl(tmp1); 6000 addl(tmp4, 2); 6001 6002 // Square and right shift by 1 the rest using 64 bit multiply 6003 bind(L_first_loop); 6004 cmpptr(tmp1, xlen); 6005 jccb(Assembler::equal, L_first_loop_exit); 6006 6007 // Square 6008 movq(raxReg, Address(x, tmp1, Address::times_4, 0)); 6009 rorq(raxReg, 32); // convert big-endian to little-endian 6010 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 6011 6012 // Right shift by 1 and save carry 6013 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 6014 rcrq(rdxReg, 1); 6015 rcrq(raxReg, 1); 6016 adcq(tmp5, 0); 6017 6018 // Store result in z 6019 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 6020 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 6021 6022 // Update indices for x and z 6023 addl(tmp1, 2); 6024 addl(tmp4, 4); 6025 jmp(L_first_loop); 6026 6027 bind(L_first_loop_exit); 6028 } 6029 6030 6031 /** 6032 * Perform the following multiply add operation using BMI2 instructions 6033 * carry:sum = sum + op1*op2 + carry 6034 * op2 should be in rdx 6035 * op2 is preserved, all other registers are modified 6036 */ 6037 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 6038 // assert op2 is rdx 6039 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 6040 addq(sum, carry); 6041 adcq(tmp2, 0); 6042 addq(sum, op1); 6043 adcq(tmp2, 0); 6044 movq(carry, tmp2); 6045 } 6046 6047 /** 6048 * Perform the following multiply add operation: 6049 * carry:sum = sum + op1*op2 + carry 6050 * Preserves op1, op2 and modifies rest of registers 6051 */ 6052 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 6053 // rdx:rax = op1 * op2 6054 movq(raxReg, op2); 6055 mulq(op1); 6056 6057 // rdx:rax = sum + carry + rdx:rax 6058 addq(sum, carry); 6059 
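  // fold the carry flag from the low-half add into rdx (the high 64 bits)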
  adcq(rdxReg, 0);
  addq(sum, raxReg);
  adcq(rdxReg, 0);

  // carry:sum = rdx:sum
  movq(carry, rdxReg);
}

/**
 * Add 64 bit long carry into z[] with carry propagation.
 * Preserves z and carry register values and modifies rest of registers.
 *
 */
void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) {
  Label L_fourth_loop, L_fourth_loop_exit;

  movl(tmp1, 1);
  subl(zlen, 2);
  addq(Address(z, zlen, Address::times_4, 0), carry);

  bind(L_fourth_loop);
  jccb(Assembler::carryClear, L_fourth_loop_exit);
  subl(zlen, 2);
  jccb(Assembler::negative, L_fourth_loop_exit);
  addq(Address(z, zlen, Address::times_4, 0), tmp1);
  jmp(L_fourth_loop);
  bind(L_fourth_loop_exit);
}

/**
 * Shift z[] left by 1 bit.
 * Preserves x, len, z and zlen registers and modifies rest of the registers.
 *
 */
void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) {

  Label L_fifth_loop, L_fifth_loop_exit;

  // Fifth loop
  // Perform primitiveLeftShift(z, zlen, 1)

  const Register prev_carry = tmp1;
  const Register new_carry = tmp4;
  const Register value = tmp2;
  const Register zidx = tmp3;

  // int zidx, carry;
  // long value;
  // carry = 0;
  // for (zidx = zlen-2; zidx >= 0; zidx -= 2) {
  //   (carry:value) = (z[i] << 1) | carry;
  //   z[i] = value;
  // }

  movl(zidx, zlen);
  xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register

  bind(L_fifth_loop);
  decl(zidx);  // Use decl to preserve carry flag
  decl(zidx);
  jccb(Assembler::negative, L_fifth_loop_exit);

  if (UseBMI2Instructions) {
    movq(value, Address(z, zidx, Address::times_4, 0));
    rclq(value, 1);
    rorxq(value, value, 32);
    movq(Address(z, zidx, Address::times_4, 0), value);  // Store back in big endian form
  }
  else {
    // clear new_carry
    xorl(new_carry, new_carry);

    // Shift z[i] by 1, or in previous carry and save new carry
    movq(value, Address(z, zidx, Address::times_4, 0));
    shlq(value, 1);
    adcl(new_carry, 0);

    orq(value, prev_carry);
    rorq(value, 0x20);
    movq(Address(z, zidx, Address::times_4, 0), value);  // Store back in big endian form

    // Set previous carry = new carry
    movl(prev_carry, new_carry);
  }
  jmp(L_fifth_loop);

  bind(L_fifth_loop_exit);
}
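// Hedged overview (reading aid only) of the squaring strategy realized by
// square_to_len() below, in the same Java-like pseudocode style used above:
//
//   // x^2 = diag + 2*offdiag, where diag is the sum of x[i]^2 terms and
//   // offdiag is the sum of x[i]*x[j] terms with i < j.
//   z = diag >>> 1;              // first loop: squares, pre-shifted right 1
//   z += offdiag;                // second..fourth loops: multiply-accumulate
//   z <<= 1;                     // fifth loop: restores diag, doubles offdiag
//   z[zlen-1] |= x[len-1] & 1;   // put back the low bit lost in the pre-shift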
/**
 * Code for BigInteger::squareToLen() intrinsic
 *
 * rdi: x
 * rsi: len
 * r8:  z
 * rcx: zlen
 * r12: tmp1
 * r13: tmp2
 * r14: tmp3
 * r15: tmp4
 * rbx: tmp5
 *
 */
void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) {

  Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, L_last_x, L_multiply;
  push(tmp1);
  push(tmp2);
  push(tmp3);
  push(tmp4);
  push(tmp5);

  // First loop
  // Store the squares, right shifted one bit (i.e., divided by 2).
  square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg);

  // Add in off-diagonal sums.
  //
  // Second, third (nested) and fourth loops.
  //  zlen += 2;
  //  for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) {
  //    carry = 0;
  //    long op2 = x[xidx:xidx+1];
  //    for (int j=xidx-2,k=zidx; j >= 0; j-=2) {
  //      k -= 2;
  //      long op1 = x[j:j+1];
  //      long sum = z[k:k+1];
  //      carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs);
  //      z[k:k+1] = sum;
  //    }
  //    add_one_64(z, k, carry, tmp_regs);
  //  }

  const Register carry = tmp5;
  const Register sum = tmp3;
  const Register op1 = tmp4;
  Register op2 = tmp2;

  push(zlen);
  push(len);
  addl(zlen, 2);
  bind(L_second_loop);
  xorq(carry, carry);
  subl(zlen, 4);
  subl(len, 2);
  push(zlen);
  push(len);
  cmpl(len, 0);
  jccb(Assembler::lessEqual, L_second_loop_exit);

  // Multiply an array by one 64 bit long.
  if (UseBMI2Instructions) {
    op2 = rdxReg;
    movq(op2, Address(x, len, Address::times_4, 0));
    rorxq(op2, op2, 32);
  }
  else {
    movq(op2, Address(x, len, Address::times_4, 0));
    rorq(op2, 32);
  }

  bind(L_third_loop);
  decrementl(len);
  jccb(Assembler::negative, L_third_loop_exit);
  decrementl(len);
  jccb(Assembler::negative, L_last_x);

  movq(op1, Address(x, len, Address::times_4, 0));
  rorq(op1, 32);

  bind(L_multiply);
  subl(zlen, 2);
  movq(sum, Address(z, zlen, Address::times_4, 0));

  // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry.
  if (UseBMI2Instructions) {
    multiply_add_64_bmi2(sum, op1, op2, carry, tmp2);
  }
  else {
    multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg);
  }

  movq(Address(z, zlen, Address::times_4, 0), sum);

  jmp(L_third_loop);
  bind(L_third_loop_exit);

  // Fourth loop
  // Add 64 bit long carry into z with carry propagation.
  // Uses the offset-adjusted zlen.
  add_one_64(z, zlen, carry, tmp1);

  pop(len);
  pop(zlen);
  jmp(L_second_loop);

  // Next infrequent code is moved outside loops.
  bind(L_last_x);
  movl(op1, Address(x, 0));
  jmp(L_multiply);

  bind(L_second_loop_exit);
  pop(len);
  pop(zlen);
  pop(len);
  pop(zlen);

  // Fifth loop
  // Shift z left 1 bit.
  lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4);

  // z[zlen-1] |= x[len-1] & 1;
  movl(tmp3, Address(x, len, Address::times_4, -4));
  andl(tmp3, 1);
  orl(Address(z, zlen, Address::times_4, -4), tmp3);

  pop(tmp5);
  pop(tmp4);
  pop(tmp3);
  pop(tmp2);
  pop(tmp1);
}

/**
 * Helper function for mul_add()
 * Multiply the in[] by int k and add to out[] starting at offset offs using
 * 128 bit by 32 bit multiply and return the carry in tmp5.
 * Only quad int aligned length of in[] is operated on in this function.
 * k is in rdxReg for BMI2Instructions, for others it is in tmp2.
 * This function preserves out, in and k registers.
 * len and offset point to the appropriate index in "in" & "out" respectively.
 * tmp5 has the carry.
 * other registers are temporary and are modified.
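 *
 * Rough per-iteration picture (illustration only, mirroring the loop
 * body): each pass consumes 4 ints of in[] as two big-endian 64-bit limbs,
 * computes limb * k + out_limb + carry via multiply_add_64{,_bmi2}, and
 * stores the low 64 bits back to out[] in big-endian order, so len and
 * offset both retreat by 4 ints per iteration.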
6293 * 6294 */ 6295 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 6296 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 6297 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 6298 6299 Label L_first_loop, L_first_loop_exit; 6300 6301 movl(tmp1, len); 6302 shrl(tmp1, 2); 6303 6304 bind(L_first_loop); 6305 subl(tmp1, 1); 6306 jccb(Assembler::negative, L_first_loop_exit); 6307 6308 subl(len, 4); 6309 subl(offset, 4); 6310 6311 Register op2 = tmp2; 6312 const Register sum = tmp3; 6313 const Register op1 = tmp4; 6314 const Register carry = tmp5; 6315 6316 if (UseBMI2Instructions) { 6317 op2 = rdxReg; 6318 } 6319 6320 movq(op1, Address(in, len, Address::times_4, 8)); 6321 rorq(op1, 32); 6322 movq(sum, Address(out, offset, Address::times_4, 8)); 6323 rorq(sum, 32); 6324 if (UseBMI2Instructions) { 6325 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 6326 } 6327 else { 6328 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 6329 } 6330 // Store back in big endian from little endian 6331 rorq(sum, 0x20); 6332 movq(Address(out, offset, Address::times_4, 8), sum); 6333 6334 movq(op1, Address(in, len, Address::times_4, 0)); 6335 rorq(op1, 32); 6336 movq(sum, Address(out, offset, Address::times_4, 0)); 6337 rorq(sum, 32); 6338 if (UseBMI2Instructions) { 6339 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 6340 } 6341 else { 6342 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 6343 } 6344 // Store back in big endian from little endian 6345 rorq(sum, 0x20); 6346 movq(Address(out, offset, Address::times_4, 0), sum); 6347 6348 jmp(L_first_loop); 6349 bind(L_first_loop_exit); 6350 } 6351 6352 /** 6353 * Code for BigInteger::mulAdd() intrinsic 6354 * 6355 * rdi: out 6356 * rsi: in 6357 * r11: offs (out.length - offset) 6358 * rcx: len 6359 * r8: k 6360 * r12: tmp1 6361 * r13: tmp2 6362 * r14: tmp3 6363 * r15: tmp4 6364 * rbx: tmp5 6365 * Multiply the in[] by word k and add to out[], return the carry in rax 6366 */ 6367 void MacroAssembler::mul_add(Register out, Register in, Register offs, 6368 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 6369 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 6370 6371 Label L_carry, L_last_in, L_done; 6372 6373 // carry = 0; 6374 // for (int j=len-1; j >= 0; j--) { 6375 // long product = (in[j] & LONG_MASK) * kLong + 6376 // (out[offs] & LONG_MASK) + carry; 6377 // out[offs--] = (int)product; 6378 // carry = product >>> 32; 6379 // } 6380 // 6381 push(tmp1); 6382 push(tmp2); 6383 push(tmp3); 6384 push(tmp4); 6385 push(tmp5); 6386 6387 Register op2 = tmp2; 6388 const Register sum = tmp3; 6389 const Register op1 = tmp4; 6390 const Register carry = tmp5; 6391 6392 if (UseBMI2Instructions) { 6393 op2 = rdxReg; 6394 movl(op2, k); 6395 } 6396 else { 6397 movl(op2, k); 6398 } 6399 6400 xorq(carry, carry); 6401 6402 //First loop 6403 6404 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 6405 //The carry is in tmp5 6406 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 6407 6408 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 6409 decrementl(len); 6410 jccb(Assembler::negative, L_carry); 6411 decrementl(len); 6412 jccb(Assembler::negative, L_last_in); 6413 6414 movq(op1, Address(in, len, Address::times_4, 0)); 6415 rorq(op1, 32); 6416 6417 subl(offs, 2); 6418 movq(sum, Address(out, offs, Address::times_4, 0)); 6419 rorq(sum, 32); 6420 6421 if (UseBMI2Instructions) { 6422 
multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 6423 } 6424 else { 6425 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 6426 } 6427 6428 // Store back in big endian from little endian 6429 rorq(sum, 0x20); 6430 movq(Address(out, offs, Address::times_4, 0), sum); 6431 6432 testl(len, len); 6433 jccb(Assembler::zero, L_carry); 6434 6435 //Multiply the last in[] entry, if any 6436 bind(L_last_in); 6437 movl(op1, Address(in, 0)); 6438 movl(sum, Address(out, offs, Address::times_4, -4)); 6439 6440 movl(raxReg, k); 6441 mull(op1); //tmp4 * eax -> edx:eax 6442 addl(sum, carry); 6443 adcl(rdxReg, 0); 6444 addl(sum, raxReg); 6445 adcl(rdxReg, 0); 6446 movl(carry, rdxReg); 6447 6448 movl(Address(out, offs, Address::times_4, -4), sum); 6449 6450 bind(L_carry); 6451 //return tmp5/carry as carry in rax 6452 movl(rax, carry); 6453 6454 bind(L_done); 6455 pop(tmp5); 6456 pop(tmp4); 6457 pop(tmp3); 6458 pop(tmp2); 6459 pop(tmp1); 6460 } 6461 #endif 6462 6463 /** 6464 * Emits code to update CRC-32 with a byte value according to constants in table 6465 * 6466 * @param [in,out]crc Register containing the crc. 6467 * @param [in]val Register containing the byte to fold into the CRC. 6468 * @param [in]table Register containing the table of crc constants. 6469 * 6470 * uint32_t crc; 6471 * val = crc_table[(val ^ crc) & 0xFF]; 6472 * crc = val ^ (crc >> 8); 6473 * 6474 */ 6475 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 6476 xorl(val, crc); 6477 andl(val, 0xFF); 6478 shrl(crc, 8); // unsigned shift 6479 xorl(crc, Address(table, val, Address::times_4, 0)); 6480 } 6481 6482 /** 6483 * Fold 128-bit data chunk 6484 */ 6485 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { 6486 if (UseAVX > 0) { 6487 vpclmulhdq(xtmp, xK, xcrc); // [123:64] 6488 vpclmulldq(xcrc, xK, xcrc); // [63:0] 6489 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); 6490 pxor(xcrc, xtmp); 6491 } else { 6492 movdqa(xtmp, xcrc); 6493 pclmulhdq(xtmp, xK); // [123:64] 6494 pclmulldq(xcrc, xK); // [63:0] 6495 pxor(xcrc, xtmp); 6496 movdqu(xtmp, Address(buf, offset)); 6497 pxor(xcrc, xtmp); 6498 } 6499 } 6500 6501 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { 6502 if (UseAVX > 0) { 6503 vpclmulhdq(xtmp, xK, xcrc); 6504 vpclmulldq(xcrc, xK, xcrc); 6505 pxor(xcrc, xbuf); 6506 pxor(xcrc, xtmp); 6507 } else { 6508 movdqa(xtmp, xcrc); 6509 pclmulhdq(xtmp, xK); 6510 pclmulldq(xcrc, xK); 6511 pxor(xcrc, xbuf); 6512 pxor(xcrc, xtmp); 6513 } 6514 } 6515 6516 /** 6517 * 8-bit folds to compute 32-bit CRC 6518 * 6519 * uint64_t xcrc; 6520 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); 6521 */ 6522 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { 6523 movdl(tmp, xcrc); 6524 andl(tmp, 0xFF); 6525 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); 6526 psrldq(xcrc, 1); // unsigned shift one byte 6527 pxor(xcrc, xtmp); 6528 } 6529 6530 /** 6531 * uint32_t crc; 6532 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 6533 */ 6534 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 6535 movl(tmp, crc); 6536 andl(tmp, 0xFF); 6537 shrl(crc, 8); 6538 xorl(crc, Address(table, tmp, Address::times_4, 0)); 6539 } 6540 6541 /** 6542 * @param crc register containing existing CRC (32-bit) 6543 * @param buf register pointing to input byte buffer (byte*) 6544 * @param len register containing number of bytes 6545 * 
@param table register that will contain address of CRC table 6546 * @param tmp scratch register 6547 */ 6548 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { 6549 assert_different_registers(crc, buf, len, table, tmp, rax); 6550 6551 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 6552 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 6553 6554 // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge 6555 // context for the registers used, where all instructions below are using 128-bit mode 6556 // On EVEX without VL and BW, these instructions will all be AVX. 6557 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); 6558 notl(crc); // ~crc 6559 cmpl(len, 16); 6560 jcc(Assembler::less, L_tail); 6561 6562 // Align buffer to 16 bytes 6563 movl(tmp, buf); 6564 andl(tmp, 0xF); 6565 jccb(Assembler::zero, L_aligned); 6566 subl(tmp, 16); 6567 addl(len, tmp); 6568 6569 align(4); 6570 BIND(L_align_loop); 6571 movsbl(rax, Address(buf, 0)); // load byte with sign extension 6572 update_byte_crc32(crc, rax, table); 6573 increment(buf); 6574 incrementl(tmp); 6575 jccb(Assembler::less, L_align_loop); 6576 6577 BIND(L_aligned); 6578 movl(tmp, len); // save 6579 shrl(len, 4); 6580 jcc(Assembler::zero, L_tail_restore); 6581 6582 // Fold crc into first bytes of vector 6583 movdqa(xmm1, Address(buf, 0)); 6584 movdl(rax, xmm1); 6585 xorl(crc, rax); 6586 if (VM_Version::supports_sse4_1()) { 6587 pinsrd(xmm1, crc, 0); 6588 } else { 6589 pinsrw(xmm1, crc, 0); 6590 shrl(crc, 16); 6591 pinsrw(xmm1, crc, 1); 6592 } 6593 addptr(buf, 16); 6594 subl(len, 4); // len > 0 6595 jcc(Assembler::less, L_fold_tail); 6596 6597 movdqa(xmm2, Address(buf, 0)); 6598 movdqa(xmm3, Address(buf, 16)); 6599 movdqa(xmm4, Address(buf, 32)); 6600 addptr(buf, 48); 6601 subl(len, 3); 6602 jcc(Assembler::lessEqual, L_fold_512b); 6603 6604 // Fold total 512 bits of polynomial on each iteration, 6605 // 128 bits per each of 4 parallel streams. 6606 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); 6607 6608 align(32); 6609 BIND(L_fold_512b_loop); 6610 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 6611 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); 6612 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); 6613 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); 6614 addptr(buf, 64); 6615 subl(len, 4); 6616 jcc(Assembler::greater, L_fold_512b_loop); 6617 6618 // Fold 512 bits to 128 bits. 6619 BIND(L_fold_512b); 6620 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); 6621 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); 6622 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); 6623 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); 6624 6625 // Fold the rest of 128 bits data chunks 6626 BIND(L_fold_tail); 6627 addl(len, 3); 6628 jccb(Assembler::lessEqual, L_fold_128b); 6629 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); 6630 6631 BIND(L_fold_tail_loop); 6632 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 6633 addptr(buf, 16); 6634 decrementl(len); 6635 jccb(Assembler::greater, L_fold_tail_loop); 6636 6637 // Fold 128 bits in xmm1 down into 32 bits in crc register. 
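  // Hedged sketch of this final reduction (reading aid only): the 128-bit
  // remainder in xmm1 is narrowed with two more carry-less multiplies
  // against the crc_by128_masks constants, then the last 32 bits are peeled
  // off with eight 8-bit table folds, four while the value is still in xmm0
  // and four more once it has been moved to the general-purpose crc register.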
  BIND(L_fold_128b);
  movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr()));
  if (UseAVX > 0) {
    vpclmulqdq(xmm2, xmm0, xmm1, 0x1);
    vpand(xmm3, xmm0, xmm2, 0 /* vector_len */);
    vpclmulqdq(xmm0, xmm0, xmm3, 0x1);
  } else {
    movdqa(xmm2, xmm0);
    pclmulqdq(xmm2, xmm1, 0x1);
    movdqa(xmm3, xmm0);
    pand(xmm3, xmm2);
    pclmulqdq(xmm0, xmm3, 0x1);
  }
  psrldq(xmm1, 8);
  psrldq(xmm2, 4);
  pxor(xmm0, xmm1);
  pxor(xmm0, xmm2);

  // 8 8-bit folds to compute 32-bit CRC.
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(xmm0, table, xmm1, rax);
  }
  movdl(crc, xmm0); // mov 32 bits to general register
  for (int j = 0; j < 4; j++) {
    fold_8bit_crc32(crc, table, rax);
  }

  BIND(L_tail_restore);
  movl(len, tmp); // restore
  BIND(L_tail);
  andl(len, 0xf);
  jccb(Assembler::zero, L_exit);

  // Fold the rest of the bytes
  align(4);
  BIND(L_tail_loop);
  movsbl(rax, Address(buf, 0)); // load byte with sign extension
  update_byte_crc32(crc, rax, table);
  increment(buf);
  decrementl(len);
  jccb(Assembler::greater, L_tail_loop);

  BIND(L_exit);
  notl(crc); // ~crc
}

#ifdef _LP64
// Helper function for AVX 512 CRC32
// Fold 512-bit data chunks
void MacroAssembler::fold512bit_crc32_avx512(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf,
                                             Register pos, int offset) {
  evmovdquq(xmm3, Address(buf, pos, Address::times_1, offset), Assembler::AVX_512bit);
  evpclmulqdq(xtmp, xcrc, xK, 0x10, Assembler::AVX_512bit); // [123:64]
  evpclmulqdq(xmm2, xcrc, xK, 0x01, Assembler::AVX_512bit); // [63:0]
  evpxorq(xcrc, xtmp, xmm2, Assembler::AVX_512bit /* vector_len */);
  evpxorq(xcrc, xcrc, xmm3, Assembler::AVX_512bit /* vector_len */);
}
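// Hedged sketch of the fold step above (reading aid only). Viewing bit
// strings as polynomials over GF(2) with P the CRC polynomial, folding a
// 128-bit lane of the running value over the next data chunk computes
//
//   crc' = clmul(crc_hi, K_hi) ^ clmul(crc_lo, K_lo) ^ next_chunk;
//
// where K_hi/K_lo are precomputed powers of x reduced mod P (the exact
// exponents are baked into the key table for the fold distance used). The
// two evpclmulqdq selectors (0x10 and 0x01) pick the matching halves and
// the two evpxorq instructions combine the three terms per 128-bit lane.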
// Helper function for AVX 512 CRC32
// Compute CRC32 for < 256B buffers
void MacroAssembler::kernel_crc32_avx512_256B(Register crc, Register buf, Register len, Register key, Register pos,
                                              Register tmp1, Register tmp2, Label& L_barrett, Label& L_16B_reduction_loop,
                                              Label& L_get_last_two_xmms, Label& L_128_done, Label& L_cleanup) {

  Label L_less_than_32, L_exact_16_left, L_less_than_16_left;
  Label L_less_than_8_left, L_less_than_4_left, L_less_than_2_left, L_zero_left;
  Label L_only_less_than_4, L_only_less_than_3, L_only_less_than_2;

  // check if there is enough buffer to be able to fold 16B at a time
  cmpl(len, 32);
  jcc(Assembler::less, L_less_than_32);

  // if there is, load the constants
  movdqu(xmm10, Address(key, 1 * 16));    // rk1 and rk2 in xmm10
  movdl(xmm0, crc);                       // get the initial crc value
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); // load the plaintext
  pxor(xmm7, xmm0);

  // update the buffer pointer
  addl(pos, 16);
  // update the counter; subtract 32 instead of 16 to save one instruction from the loop
  subl(len, 32);
  jmp(L_16B_reduction_loop);

  bind(L_less_than_32);
  // mov initial crc to the return value; this is necessary for zero-length buffers.
  movl(rax, crc);
  testl(len, len);
  jcc(Assembler::equal, L_cleanup);

  movdl(xmm0, crc); // get the initial crc value

  cmpl(len, 16);
  jcc(Assembler::equal, L_exact_16_left);
  jcc(Assembler::less, L_less_than_16_left);

  movdqu(xmm7, Address(buf, pos, Address::times_1, 0 * 16)); // load the plaintext
  pxor(xmm7, xmm0); // xor the initial crc value
  addl(pos, 16);
  subl(len, 16);
  movdqu(xmm10, Address(key, 1 * 16)); // rk1 and rk2 in xmm10
  jmp(L_get_last_two_xmms);

  bind(L_less_than_16_left);
  // use stack space to load data less than 16 bytes; zero out the 16B in memory first.
  pxor(xmm1, xmm1);
  movptr(tmp1, rsp);
  movdqu(Address(tmp1, 0 * 16), xmm1);

  cmpl(len, 4);
  jcc(Assembler::less, L_only_less_than_4);

  // backup the counter value
  movl(tmp2, len);
  cmpl(len, 8);
  jcc(Assembler::less, L_less_than_8_left);

  // load 8 bytes
  movq(rax, Address(buf, pos, Address::times_1, 0 * 16));
  movq(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 8);
  subl(len, 8);
  addl(pos, 8);

  bind(L_less_than_8_left);
  cmpl(len, 4);
  jcc(Assembler::less, L_less_than_4_left);

  // load 4 bytes
  movl(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 4);
  subl(len, 4);
  addl(pos, 4);

  bind(L_less_than_4_left);
  cmpl(len, 2);
  jcc(Assembler::less, L_less_than_2_left);

  // load 2 bytes
  movw(rax, Address(buf, pos, Address::times_1, 0));
  movl(Address(tmp1, 0 * 16), rax);
  addptr(tmp1, 2);
  subl(len, 2);
  addl(pos, 2);

  bind(L_less_than_2_left);
  cmpl(len, 1);
  jcc(Assembler::less, L_zero_left);

  // load 1 byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0 * 16), rax);

  bind(L_zero_left);
  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); // xor the initial crc value

  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, tmp2));
  pshufb(xmm7, xmm0);
  jmp(L_128_done);

  bind(L_exact_16_left);
  movdqu(xmm7, Address(buf, pos, Address::times_1, 0));
  pxor(xmm7, xmm0); // xor the initial crc value
  jmp(L_128_done);

  bind(L_only_less_than_4);
  cmpl(len, 3);
  jcc(Assembler::less, L_only_less_than_3);

  // load 3 bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movb(rax, Address(buf, pos, Address::times_1, 2));
  movb(Address(tmp1, 2), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); // xor the initial crc value

  pslldq(xmm7, 0x5);
  jmp(L_barrett);
  bind(L_only_less_than_3);
  cmpl(len, 2);
  jcc(Assembler::less, L_only_less_than_2);

  // load 2 bytes
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movb(rax, Address(buf, pos, Address::times_1, 1));
  movb(Address(tmp1, 1), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); // xor the initial crc value

  pslldq(xmm7, 0x6);
  jmp(L_barrett);

  bind(L_only_less_than_2);
  // load 1 byte
  movb(rax, Address(buf, pos, Address::times_1, 0));
  movb(Address(tmp1, 0), rax);

  movdqu(xmm7, Address(rsp, 0));
  pxor(xmm7, xmm0); // xor the initial crc value

  pslldq(xmm7, 0x7);
}
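// Hedged note on the tail handling above (reading aid only): 1-3 byte
// buffers are staged at the bottom of a zeroed 16-byte stack slot and then
// shifted left with pslldq (by 7, 6 or 5 byte positions for 1, 2 or 3 input
// bytes), so the significant bytes land where the reduction code behind
// L_barrett expects them; the shuf_table lookup at L_zero_left plays the
// same alignment role for 4..15 byte tails.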
/**
 * Compute CRC32 using AVX512 instructions
 * param crc   register containing existing CRC (32-bit)
 * param buf   register pointing to input byte buffer (byte*)
 * param len   register containing number of bytes
 * param tmp1  scratch register
 * param tmp2  scratch register
 * return rax  result register
 */
void MacroAssembler::kernel_crc32_avx512(Register crc, Register buf, Register len, Register key, Register tmp1, Register tmp2) {
  assert_different_registers(crc, buf, len, key, tmp1, tmp2, rax);

  Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned;
  Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop;
  Label L_less_than_256, L_fold_128_B_loop, L_fold_256_B_loop;
  Label L_fold_128_B_register, L_final_reduction_for_128, L_16B_reduction_loop;
  Label L_128_done, L_get_last_two_xmms, L_barrett, L_cleanup;

  const Register pos = r12;
  push(r12);
  subptr(rsp, 16 * 2 + 8);

  // For EVEX with VL and BW, provide a standard mask, VL = 128 will guide the merge
  // context for the registers used, where all instructions below are using 128-bit mode.
  // On EVEX without VL and BW, these instructions will all be AVX.
  lea(key, ExternalAddress(StubRoutines::x86::crc_table_avx512_addr()));
  notl(crc);
  movl(pos, 0);

  // check if smaller than 256B
  cmpl(len, 256);
  jcc(Assembler::less, L_less_than_256);

  // load the initial crc value
  movdl(xmm10, crc);

  // receive the initial 64B data, xor the initial crc value
  evmovdquq(xmm0, Address(buf, pos, Address::times_1, 0 * 64), Assembler::AVX_512bit);
  evmovdquq(xmm4, Address(buf, pos, Address::times_1, 1 * 64), Assembler::AVX_512bit);
  evpxorq(xmm0, xmm0, xmm10, Assembler::AVX_512bit);
  evbroadcasti32x4(xmm10, Address(key, 2 * 16), Assembler::AVX_512bit); // zmm10 has rk3 and rk4

  subl(len, 256);
  cmpl(len, 256);
  jcc(Assembler::less, L_fold_128_B_loop);

  evmovdquq(xmm7, Address(buf, pos, Address::times_1, 2 * 64), Assembler::AVX_512bit);
  evmovdquq(xmm8, Address(buf, pos, Address::times_1, 3 * 64), Assembler::AVX_512bit);
  evbroadcasti32x4(xmm16, Address(key, 0 * 16), Assembler::AVX_512bit); // zmm16 has rk-1 and rk-2
  subl(len, 256);

  bind(L_fold_256_B_loop);
  addl(pos, 256);
  fold512bit_crc32_avx512(xmm0, xmm16, xmm1, buf, pos, 0 * 64);
  fold512bit_crc32_avx512(xmm4, xmm16, xmm1, buf, pos, 1 * 64);
  fold512bit_crc32_avx512(xmm7, xmm16, xmm1, buf, pos, 2 * 64);
  fold512bit_crc32_avx512(xmm8, xmm16, xmm1, buf, pos, 3 * 64);

  subl(len, 256);
  jcc(Assembler::greaterEqual, L_fold_256_B_loop);

  // Fold 256 into 128
  addl(pos, 256);
  evpclmulqdq(xmm1, xmm0, xmm10, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm2, xmm0, xmm10, 0x10, Assembler::AVX_512bit);
  vpternlogq(xmm7, 0x96, xmm1, xmm2, Assembler::AVX_512bit); // xor ABC

  evpclmulqdq(xmm5, xmm4, xmm10, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm6, xmm4, xmm10, 0x10, Assembler::AVX_512bit);
  vpternlogq(xmm8, 0x96, xmm5, xmm6, Assembler::AVX_512bit); // xor ABC

  evmovdquq(xmm0, xmm7, Assembler::AVX_512bit);
  evmovdquq(xmm4, xmm8, Assembler::AVX_512bit);

  addl(len, 128);
  jmp(L_fold_128_B_register);

  // at this section of the code, there is 128 * x + y (0 <= y < 128) bytes of buffer. The fold_128_B_loop
  // loop will fold 128B at a time until we have 128 + y Bytes of buffer

  // fold 128B at a time. This section of the code folds 8 xmm registers in parallel
  bind(L_fold_128_B_loop);
  addl(pos, 128);
  fold512bit_crc32_avx512(xmm0, xmm10, xmm1, buf, pos, 0 * 64);
  fold512bit_crc32_avx512(xmm4, xmm10, xmm1, buf, pos, 1 * 64);

  subl(len, 128);
  jcc(Assembler::greaterEqual, L_fold_128_B_loop);

  addl(pos, 128);

  // at this point, the buffer pointer is pointing at the last y Bytes of the buffer, where 0 <= y < 128
  // the 128B of folded data is in 8 of the xmm registers: xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
  bind(L_fold_128_B_register);
  evmovdquq(xmm16, Address(key, 5 * 16), Assembler::AVX_512bit); // multiply by rk9-rk16
  evmovdquq(xmm11, Address(key, 9 * 16), Assembler::AVX_512bit); // multiply by rk17-rk20, rk1,rk2, 0,0
  evpclmulqdq(xmm1, xmm0, xmm16, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm2, xmm0, xmm16, 0x10, Assembler::AVX_512bit);
  // save last that has no multiplicand
  vextracti64x2(xmm7, xmm4, 3);

  evpclmulqdq(xmm5, xmm4, xmm11, 0x01, Assembler::AVX_512bit);
  evpclmulqdq(xmm6, xmm4, xmm11, 0x10, Assembler::AVX_512bit);
  // Needed later in reduction loop
  movdqu(xmm10, Address(key, 1 * 16));
  vpternlogq(xmm1, 0x96, xmm2, xmm5, Assembler::AVX_512bit); // xor ABC
  vpternlogq(xmm1, 0x96, xmm6, xmm7, Assembler::AVX_512bit); // xor ABC

  // Swap 1,0,3,2 - 01 00 11 10
  evshufi64x2(xmm8, xmm1, xmm1, 0x4e, Assembler::AVX_512bit);
  evpxorq(xmm8, xmm8, xmm1, Assembler::AVX_256bit);
  vextracti128(xmm5, xmm8, 1);
  evpxorq(xmm7, xmm5, xmm8, Assembler::AVX_128bit);

  // instead of 128, we add 128 - 16 to the loop counter to save 1 instruction from the loop
  // instead of a cmp instruction, we use the negative flag with the jl instruction
  addl(len, 128 - 16);
  jcc(Assembler::less, L_final_reduction_for_128);

  bind(L_16B_reduction_loop);
  vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
  movdqu(xmm0, Address(buf, pos, Address::times_1, 0 * 16));
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  addl(pos, 16);
  subl(len, 16);
  jcc(Assembler::greaterEqual, L_16B_reduction_loop);

  bind(L_final_reduction_for_128);
  addl(len, 16);
  jcc(Assembler::equal, L_128_done);

  bind(L_get_last_two_xmms);
  movdqu(xmm2, xmm7);
  addl(pos, len);
  movdqu(xmm1, Address(buf, pos, Address::times_1, -16));
  subl(pos, len);

  // get rid of the extra data that was loaded before
  // load the shift constant
  lea(rax, ExternalAddress(StubRoutines::x86::shuf_table_crc32_avx512_addr()));
  movdqu(xmm0, Address(rax, len));
  addl(rax, len);

  vpshufb(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  // Change mask to 512
  vpxor(xmm0, xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 2 * 16), Assembler::AVX_128bit, tmp2);
  vpshufb(xmm2, xmm2, xmm0, Assembler::AVX_128bit);

  blendvpb(xmm2, xmm2, xmm1, xmm0, Assembler::AVX_128bit);
  vpclmulqdq(xmm8, xmm7, xmm10, 0x1);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm8, Assembler::AVX_128bit);
  vpxor(xmm7, xmm7, xmm2, Assembler::AVX_128bit);

  bind(L_128_done);
  // compute crc of a 128-bit value
  movdqu(xmm10, Address(key, 3 * 16));
  movdqu(xmm0, xmm7);

  // 64b fold
  vpclmulqdq(xmm7, xmm7, xmm10, 0x0);
  vpsrldq(xmm0, xmm0, 0x8, Assembler::AVX_128bit);
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);

  // 32b fold
  movdqu(xmm0, xmm7);
  vpslldq(xmm7, xmm7, 0x4, Assembler::AVX_128bit);
  vpclmulqdq(xmm7, xmm7, xmm10, 0x10);
  vpxor(xmm7, xmm7, xmm0, Assembler::AVX_128bit);
  jmp(L_barrett);

  bind(L_less_than_256);
  kernel_crc32_avx512_256B(crc, buf, len, key, pos, tmp1, tmp2, L_barrett, L_16B_reduction_loop, L_get_last_two_xmms, L_128_done, L_cleanup);

  // Barrett reduction
  bind(L_barrett);
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr() + 1 * 16), Assembler::AVX_128bit, tmp2);
  movdqu(xmm1, xmm7);
  movdqu(xmm2, xmm7);
  movdqu(xmm10, Address(key, 4 * 16));

  pclmulqdq(xmm7, xmm10, 0x0);
  pxor(xmm7, xmm2);
  vpand(xmm7, xmm7, ExternalAddress(StubRoutines::x86::crc_by128_masks_avx512_addr()), Assembler::AVX_128bit, tmp2);
  movdqu(xmm2, xmm7);
  pclmulqdq(xmm7, xmm10, 0x10);
  pxor(xmm7, xmm2);
  pxor(xmm7, xmm1);
  pextrd(crc, xmm7, 2);

  bind(L_cleanup);
  notl(crc); // ~c
  addptr(rsp, 16 * 2 + 8);
  pop(r12);
}

// S. Gueron / Information Processing Letters 112 (2012) 184
// Algorithm 4: Computing carry-less multiplication using a precomputed lookup table.
// Input: A 32 bit value B = [byte3, byte2, byte1, byte0].
// Output: the 64-bit carry-less product of B * CONST
void MacroAssembler::crc32c_ipl_alg4(Register in, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3) {
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addq(tmp3, n * 256 * 8);
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);
  addq(tmp1, tmp3);
  movq(tmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 8);
  xorq(tmp1, tmp2);

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addq(tmp2, tmp3);
  movq(tmp2, Address(tmp2, 0));

  shlq(tmp2, 16);
  xorq(tmp1, tmp2);

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in, 24);
  andl(in, 0x000000FF);
  shll(in, 3);
  addq(in, tmp3);
  movq(in, Address(in, 0));

  shlq(in, 24);
  xorq(in, tmp1);
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}

void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out); // modified blindly

    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);

    movdq(in_out, w_xtmp1);
  } else {
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3);
  }
}
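// A minimal C-style sketch of the Algorithm 4 table lookup and recombination
// implemented above (hypothetical standalone code, not the HotSpot API;
// 'table' stands in for the 256-entry precomputed table TABLEExt[n] of
// 64-bit carry-less products byte * CONST):
//
//   #include <stdint.h>
//
//   // Carry-less multiply of a 32-bit value B by CONST, one byte at a time.
//   static uint64_t clmul_by_table(uint32_t b, const uint64_t table[256]) {
//     uint64_t q1 = table[b         & 0xFF];
//     uint64_t q2 = table[(b >> 8)  & 0xFF];
//     uint64_t q3 = table[(b >> 16) & 0xFF];
//     uint64_t q4 = table[(b >> 24) & 0xFF];
//     // Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24, as in the comment above
//     return q1 ^ (q2 << 8) ^ (q3 << 16) ^ (q4 << 24);
//   }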
// Recombination Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  shlq(in_out, 1);
  movl(tmp1, in_out);
  shrq(in_out, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in_out, tmp2); // we don't care about upper 32 bit contents here
  shlq(in1, 1);
  movl(tmp1, in1);
  shrq(in1, 32);
  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  xorl(in_out, in1);
  xorl(in_out, in2);
}

// Set N to a predefined value
// Subtract from the length of the buffer
// execute in a loop:
//   CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
//   for i = 1 to N do
//     CRC_A = CRC32(CRC_A, A[i])
//     CRC_B = CRC32(CRC_B, B[i])
//     CRC_C = CRC32(CRC_C, C[i])
//   end for
//   Recombine
void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);
  xorl(tmp1, tmp1);
  xorl(tmp2, tmp2);
  movq(tmp3, in_out2);
  addq(tmp3, size);

  bind(L_processPartition);
  crc32(in_out3, Address(in_out2, 0), 8);
  crc32(tmp1, Address(in_out2, size), 8);
  crc32(tmp2, Address(in_out2, size * 2), 8);
  addq(in_out2, 8);
  cmpq(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);
  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
                  w_xtmp1, w_xtmp2, w_xtmp3,
                  tmp4, tmp5,
                  n_tmp6);
  addq(in_out2, 2 * size);
  subl(in_out1, 3 * size);
  jmp(L_processPartitions);

  bind(L_exit);
}
#else
void MacroAssembler::crc32c_ipl_alg4(Register in_out, uint32_t n,
                                     Register tmp1, Register tmp2, Register tmp3,
                                     XMMRegister xtmp1, XMMRegister xtmp2) {
  lea(tmp3, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    addl(tmp3, n * 256 * 8);
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  movl(tmp1, in_out);
  andl(tmp1, 0x000000FF);
  shll(tmp1, 3);
  addl(tmp1, tmp3);
  movq(xtmp1, Address(tmp1, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  movl(tmp2, in_out);
  shrl(tmp2, 8);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addl(tmp2, tmp3);
  movq(xtmp2, Address(tmp2, 0));

  psllq(xtmp2, 8);
  pxor(xtmp1, xtmp2);

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  movl(tmp2, in_out);
  shrl(tmp2, 16);
  andl(tmp2, 0x000000FF);
  shll(tmp2, 3);
  addl(tmp2, tmp3);
  movq(xtmp2, Address(tmp2, 0));

  psllq(xtmp2, 16);
  pxor(xtmp1, xtmp2);

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  shrl(in_out, 24);
  andl(in_out, 0x000000FF);
  shll(in_out, 3);
  addl(in_out, tmp3);
  movq(xtmp2, Address(in_out, 0));

  psllq(xtmp2, 24);
  pxor(xtmp1, xtmp2); // Result in CXMM
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}

void MacroAssembler::crc32c_pclmulqdq(XMMRegister w_xtmp1,
                                      Register in_out,
                                      uint32_t const_or_pre_comp_const_index, bool is_pclmulqdq_supported,
                                      XMMRegister w_xtmp2,
                                      Register tmp1,
                                      Register n_tmp2, Register n_tmp3) {
  if (is_pclmulqdq_supported) {
    movdl(w_xtmp1, in_out);

    movl(tmp1, const_or_pre_comp_const_index);
    movdl(w_xtmp2, tmp1);
    pclmulqdq(w_xtmp1, w_xtmp2, 0);
    // Keep result in XMM since GPR is 32 bit in length
  } else {
    crc32c_ipl_alg4(in_out, const_or_pre_comp_const_index, tmp1, n_tmp2, n_tmp3, w_xtmp1, w_xtmp2);
  }
}

void MacroAssembler::crc32c_rec_alt2(uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported, Register in_out, Register in1, Register in2,
                                     XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                     Register tmp1, Register tmp2,
                                     Register n_tmp3) {
  crc32c_pclmulqdq(w_xtmp1, in_out, const_or_pre_comp_const_index_u1, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);
  crc32c_pclmulqdq(w_xtmp2, in1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, w_xtmp3, tmp1, tmp2, n_tmp3);

  psllq(w_xtmp1, 1);
  movdl(tmp1, w_xtmp1);
  psrlq(w_xtmp1, 32);
  movdl(in_out, w_xtmp1);

  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in_out, tmp2);

  psllq(w_xtmp2, 1);
  movdl(tmp1, w_xtmp2);
  psrlq(w_xtmp2, 32);
  movdl(in1, w_xtmp2);

  xorl(tmp2, tmp2);
  crc32(tmp2, tmp1, 4);
  xorl(in1, tmp2);
  xorl(in_out, in1);
  xorl(in_out, in2);
}

void MacroAssembler::crc32c_proc_chunk(uint32_t size, uint32_t const_or_pre_comp_const_index_u1, uint32_t const_or_pre_comp_const_index_u2, bool is_pclmulqdq_supported,
                                       Register in_out1, Register in_out2, Register in_out3,
                                       Register tmp1, Register tmp2, Register tmp3,
                                       XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                       Register tmp4, Register tmp5,
                                       Register n_tmp6) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  bind(L_processPartitions);
  cmpl(in_out1, 3 * size);
  jcc(Assembler::less, L_exit);
  xorl(tmp1, tmp1);
  xorl(tmp2, tmp2);
  movl(tmp3, in_out2);
  addl(tmp3, size);

  bind(L_processPartition);
  crc32(in_out3, Address(in_out2, 0), 4);
  crc32(tmp1, Address(in_out2, size), 4);
  crc32(tmp2, Address(in_out2, size * 2), 4);
  crc32(in_out3, Address(in_out2, 0 + 4), 4);
  crc32(tmp1, Address(in_out2, size + 4), 4);
  crc32(tmp2, Address(in_out2, size * 2 + 4), 4);
  addl(in_out2, 8);
  cmpl(in_out2, tmp3);
  jcc(Assembler::less, L_processPartition);

  push(tmp3);
  push(in_out1);
  push(in_out2);
  tmp4 = tmp3;
  tmp5 = in_out1;
  n_tmp6 = in_out2;

  crc32c_rec_alt2(const_or_pre_comp_const_index_u1, const_or_pre_comp_const_index_u2, is_pclmulqdq_supported, in_out3, tmp1, tmp2,
                  w_xtmp1, w_xtmp2, w_xtmp3,
                  tmp4, tmp5,
                  n_tmp6);

  pop(in_out2);
  pop(in_out1);
  pop(tmp3);

  addl(in_out2, 2 * size);
  subl(in_out1, 3 * size);
  jmp(L_processPartitions);

  bind(L_exit);
}
#endif //LP64

#ifdef _LP64
// Algorithm 2: Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B, C each consist of
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j = 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j = 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j = 0, 1, ..., N-1
// if r > 0 R[j] = I [3N + j], j = 0, 1, ..., r-1
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported) {
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
    assert((CRC32C_NUM_PRECOMPUTED_CONSTANTS - 1) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addq(tmp1, in1);

  BIND(L_wordByWord);
  cmpq(in1, tmp1);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
  crc32(in_out, Address(in1, 0), 4);
  addq(in1, 4);
  jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);
  movl(tmp2, 1);

  BIND(L_byteByByte);
  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  crc32(in_out, Address(in1, 0), 1);
  incq(in1);
  incl(tmp2);
  jmp(L_byteByByte);

  BIND(L_exit);
}
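// The pipelining in Algorithm 2 works by splitting the buffer into three
// independent CRC streams so the multi-cycle latency of the CRC32
// instruction can be hidden. A minimal C-style sketch of that loop
// structure (hypothetical standalone code; 'recombine' stands in for the
// crc32c_rec_alt2 step above and is not spelled out here; the SSE4.2
// intrinsic assumes <nmmintrin.h>):
//
//   #include <nmmintrin.h>
//   #include <stdint.h>
//
//   uint32_t recombine(uint32_t a, uint32_t b, uint32_t c, uint32_t n); // see crc32c_rec_alt2
//
//   static uint32_t crc32c_pipelined(uint32_t crc, const uint64_t* p, uint32_t n) {
//     uint32_t a = crc, b = 0, c = 0;
//     for (uint32_t i = 0; i < n; i++) {   // three independent dependency chains
//       a = (uint32_t)_mm_crc32_u64(a, p[i]);
//       b = (uint32_t)_mm_crc32_u64(b, p[n + i]);
//       c = (uint32_t)_mm_crc32_u64(c, p[2 * n + i]);
//     }
//     return recombine(a, b, c, n);        // shift A and B over B|C and C, then xor
//   }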
#else
void MacroAssembler::crc32c_ipl_alg2_alt2(Register in_out, Register in1, Register in2,
                                          Register tmp1, Register tmp2, Register tmp3,
                                          Register tmp4, Register tmp5, Register tmp6,
                                          XMMRegister w_xtmp1, XMMRegister w_xtmp2, XMMRegister w_xtmp3,
                                          bool is_pclmulqdq_supported) {
  uint32_t const_or_pre_comp_const_index[CRC32C_NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (is_pclmulqdq_supported) {
    const_or_pre_comp_const_index[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    const_or_pre_comp_const_index[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);

    const_or_pre_comp_const_index[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    const_or_pre_comp_const_index[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    const_or_pre_comp_const_index[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    const_or_pre_comp_const_index[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
  } else {
    const_or_pre_comp_const_index[0] = 1;
    const_or_pre_comp_const_index[1] = 0;

    const_or_pre_comp_const_index[2] = 3;
    const_or_pre_comp_const_index[3] = 2;

    const_or_pre_comp_const_index[4] = 5;
    const_or_pre_comp_const_index[5] = 4;
  }
  crc32c_proc_chunk(CRC32C_HIGH, const_or_pre_comp_const_index[0], const_or_pre_comp_const_index[1], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_MIDDLE, const_or_pre_comp_const_index[2], const_or_pre_comp_const_index[3], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  crc32c_proc_chunk(CRC32C_LOW, const_or_pre_comp_const_index[4], const_or_pre_comp_const_index[5], is_pclmulqdq_supported,
                    in2, in1, in_out,
                    tmp1, tmp2, tmp3,
                    w_xtmp1, w_xtmp2, w_xtmp3,
                    tmp4, tmp5,
                    tmp6);
  movl(tmp1, in2);
  andl(tmp1, 0x00000007);
  negl(tmp1);
  addl(tmp1, in2);
  addl(tmp1, in1);

  BIND(L_wordByWord);
  cmpl(in1, tmp1);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
  crc32(in_out, Address(in1, 0), 4);
  addl(in1, 4);
  jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(in2, 0x00000007);
  movl(tmp2, 1);

  BIND(L_byteByByte);
  cmpl(tmp2, in2);
  jccb(Assembler::greater, L_exit);
  movb(tmp1, Address(in1, 0));
  crc32(in_out, tmp1, 1);
  incl(in1);
  incl(tmp2);
  jmp(L_byteByByte);

  BIND(L_exit);
}
#endif // LP64
#undef BIND
#undef BLOCK_COMMENT

// Compress char[] array to byte[].
// ..\jdk\src\java.base\share\classes\java\lang\StringUTF16.java
// @HotSpotIntrinsicCandidate
// private static int compress(char[] src, int srcOff, byte[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     int c = src[srcOff++];
//     if (c >>> 8 != 0) {
//       return 0;
//     }
//     dst[dstOff++] = (byte)c;
//   }
//   return len;
// }
void MacroAssembler::char_array_compress(Register src, Register dst, Register len,
                                         XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                         XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                         Register tmp5, Register result) {
  Label copy_chars_loop, return_length, return_zero, done;

  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result

  // rsi holds start addr of source char[] to be compressed
  // rdi holds start addr of destination byte[]
  // rdx holds length

  assert(len != result, "");

  // save length for return
  push(len);

  if ((AVX3Threshold == 0) && (UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_loop_tail, below_threshold;

    // alignment
    Label post_alignment;

    // if length of the string is less than 32, handle it the old-fashioned way
    testl(len, -32);
    jcc(Assembler::zero, below_threshold);

    // First check whether a character is compressible (<= 0xFF).
    // Create mask to test for Unicode chars inside the zmm vector
    movl(result, 0x00FF);
    evpbroadcastw(tmp2Reg, result, Assembler::AVX_512bit);

    testl(len, -64);
    jcc(Assembler::zero, post_alignment);

    movl(tmp5, dst);
    andl(tmp5, (32 - 1));
    negl(tmp5);
    andl(tmp5, (32 - 1));

    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, post_alignment);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, tmp5);
    notl(result);
    kmovdl(k3, result);

    evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
    evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    ktestd(k2, k3);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);

    addptr(src, tmp5);
    addptr(src, tmp5); // advance src by tmp5 chars (2 bytes each)
    addptr(dst, tmp5);
    subl(len, tmp5);

    bind(post_alignment);
    // end of alignment

    movl(tmp5, len);
    andl(tmp5, (32 - 1)); // tail count (in chars)
    andl(len, ~(32 - 1)); // vector count (in chars)
    jcc(Assembler::zero, copy_loop_tail);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    evmovdquw(tmp1Reg, Address(src, len, Address::times_2), Assembler::AVX_512bit);
    evpcmpuw(k2, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    kortestdl(k2, k2);
    jcc(Assembler::carryClear, return_zero);

    // All elements in the current processed chunk are valid candidates for
    // compression. Write truncated byte elements to memory.
    evpmovwb(Address(dst, len, Address::times_1), tmp1Reg, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_loop_tail);
    // bail out when there is nothing to be done
    testl(tmp5, 0xFFFFFFFF);
    jcc(Assembler::zero, return_length);

    movl(len, tmp5);

    // ~(~0 << len), where len is the # of remaining elements to process
    movl(result, 0xFFFFFFFF);
    shlxl(result, result, len);
    notl(result);

    kmovdl(k3, result);

    evmovdquw(tmp1Reg, k3, Address(src, 0), Assembler::AVX_512bit);
    evpcmpuw(k2, k3, tmp1Reg, tmp2Reg, Assembler::le, Assembler::AVX_512bit);
    ktestd(k2, k3);
    jcc(Assembler::carryClear, return_zero);

    evpmovwb(Address(dst, 0), k3, tmp1Reg, Assembler::AVX_512bit);
    jmp(return_length);

    bind(below_threshold);
  }

  if (UseSSE42Intrinsics) {
    Label copy_32_loop, copy_16, copy_tail;

    movl(result, len);

    movl(tmp5, 0xff00ff00); // create mask to test for Unicode chars in vectors

    // vectored compression
    andl(len, 0xfffffff0);    // vector count (in chars)
    andl(result, 0x0000000f); // tail count (in chars)
    testl(len, len);
    jcc(Assembler::zero, copy_16);

    // compress 16 chars per iter
    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
    pxor(tmp4Reg, tmp4Reg);

    lea(src, Address(src, len, Address::times_2));
    lea(dst, Address(dst, len, Address::times_1));
    negptr(len);

    bind(copy_32_loop);
    movdqu(tmp2Reg, Address(src, len, Address::times_2));     // load 1st 8 characters
    por(tmp4Reg, tmp2Reg);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, 16)); // load next 8 characters
    por(tmp4Reg, tmp3Reg);
    ptest(tmp4Reg, tmp1Reg); // check for Unicode chars in next vector
    jcc(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg); // only ASCII chars; compress each to 1 byte
    movdqu(Address(dst, len, Address::times_1), tmp2Reg);
    addptr(len, 16);
    jcc(Assembler::notZero, copy_32_loop);

    // compress next vector of 8 chars (if any)
    bind(copy_16);
    movl(len, result);
    andl(len, 0xfffffff8);    // vector count (in chars)
    andl(result, 0x00000007); // tail count (in chars)
    testl(len, len);
    jccb(Assembler::zero, copy_tail);

    movdl(tmp1Reg, tmp5);
    pshufd(tmp1Reg, tmp1Reg, 0); // store Unicode mask in tmp1Reg
    pxor(tmp3Reg, tmp3Reg);

    movdqu(tmp2Reg, Address(src, 0));
    ptest(tmp2Reg, tmp1Reg); // check for Unicode chars in vector
    jccb(Assembler::notZero, return_zero);
    packuswb(tmp2Reg, tmp3Reg); // only LATIN1 chars; compress each to 1 byte
    movq(Address(dst, 0), tmp2Reg);
    addptr(src, 16);
    addptr(dst, 8);

    bind(copy_tail);
    movl(len, result);
  }
  // compress 1 char per iter
  testl(len, len);
  jccb(Assembler::zero, return_length);
  lea(src, Address(src, len, Address::times_2));
  lea(dst, Address(dst, len, Address::times_1));
  negptr(len);

  bind(copy_chars_loop);
  load_unsigned_short(result, Address(src, len, Address::times_2));
  testl(result, 0xff00); // check if Unicode char
  jccb(Assembler::notZero, return_zero);
  movb(Address(dst, len, Address::times_1), result); // ASCII char; compress to 1 byte
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  // if compression succeeded, return length
  bind(return_length);
  pop(result);
  jmpb(done);

  // if compression failed, return 0
  bind(return_zero);
  xorl(result, result);
  addptr(rsp, wordSize);

  bind(done);
}
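// Both masked tail paths above (and the one in byte_array_inflate below)
// build the AVX-512 k-register predicate as ~(~0 << len). A minimal C-style
// sketch of that mask trick (hypothetical standalone code, not the HotSpot
// API):
//
//   #include <stdint.h>
//
//   // Mask with the low 'len' bits set (0 < len < 32), selecting which of
//   // the 32 vector lanes the masked AVX-512 load/store may touch.
//   static uint32_t tail_mask(uint32_t len) {
//     return ~(~0u << len); // e.g. len = 5 -> 0b11111
//   }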
// Inflate byte[] array to char[].
// ..\jdk\src\java.base\share\classes\java\lang\StringLatin1.java
// @HotSpotIntrinsicCandidate
// private static void inflate(byte[] src, int srcOff, char[] dst, int dstOff, int len) {
//   for (int i = 0; i < len; i++) {
//     dst[dstOff++] = (char)(src[srcOff++] & 0xff);
//   }
// }
void MacroAssembler::byte_array_inflate(Register src, Register dst, Register len,
                                        XMMRegister tmp1, Register tmp2) {
  Label copy_chars_loop, done, below_threshold, avx3_threshold;
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp2

  // rsi holds start addr of source byte[] to be inflated
  // rdi holds start addr of destination char[]
  // rdx holds length
  assert_different_registers(src, dst, len, tmp2);
  movl(tmp2, len);
  if ((UseAVX > 2) && // AVX512
      VM_Version::supports_avx512vlbw() &&
      VM_Version::supports_bmi2()) {

    Label copy_32_loop, copy_tail;
    Register tmp3_aliased = len;

    // if length of the string is less than 16, handle it the old-fashioned way
    testl(len, -16);
    jcc(Assembler::zero, below_threshold);

    testl(len, -1 * AVX3Threshold);
    jcc(Assembler::zero, avx3_threshold);

    // In order to use only one arithmetic operation for the main loop we use
    // this pre-calculation
    andl(tmp2, (32 - 1)); // tail count (in chars), 32 element wide loop
    andl(len, -32);       // vector count
    jccb(Assembler::zero, copy_tail);

    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    // inflate 32 chars per iter
    bind(copy_32_loop);
    vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_512bit);
    evmovdquw(Address(dst, len, Address::times_2), tmp1, Assembler::AVX_512bit);
    addptr(len, 32);
    jcc(Assembler::notZero, copy_32_loop);

    bind(copy_tail);
    // bail out when there is nothing to be done
    testl(tmp2, -1); // we don't destroy the contents of tmp2 here
    jcc(Assembler::zero, done);

    // ~(~0 << length), where length is the # of remaining elements to process
    movl(tmp3_aliased, -1);
    shlxl(tmp3_aliased, tmp3_aliased, tmp2);
    notl(tmp3_aliased);
    kmovdl(k2, tmp3_aliased);
    evpmovzxbw(tmp1, k2, Address(src, 0), Assembler::AVX_512bit);
    evmovdquw(Address(dst, 0), k2, tmp1, Assembler::AVX_512bit);

    jmp(done);
    bind(avx3_threshold);
  }
  if (UseSSE42Intrinsics) {
    Label copy_16_loop, copy_8_loop, copy_bytes, copy_new_tail, copy_tail;

    if (UseAVX > 1) {
      andl(tmp2, (16 - 1));
      andl(len, -16);
      jccb(Assembler::zero, copy_new_tail);
    } else {
      andl(tmp2, 0x00000007); // tail count (in chars)
      andl(len, 0xfffffff8);  // vector count (in chars)
      jccb(Assembler::zero, copy_tail);
    }

    // vectored inflation
    lea(src, Address(src, len, Address::times_1));
    lea(dst, Address(dst, len, Address::times_2));
    negptr(len);

    if (UseAVX > 1) {
      bind(copy_16_loop);
      vpmovzxbw(tmp1, Address(src, len, Address::times_1), Assembler::AVX_256bit);
      vmovdqu(Address(dst, len, Address::times_2), tmp1);
      addptr(len, 16);
      jcc(Assembler::notZero, copy_16_loop);
      bind(below_threshold);
      bind(copy_new_tail);
      movl(len, tmp2);
      andl(tmp2, 0x00000007);
      andl(len, 0xFFFFFFF8);
      jccb(Assembler::zero, copy_tail);

      pmovzxbw(tmp1, Address(src, 0));
      movdqu(Address(dst, 0), tmp1);
      addptr(src, 8);
      addptr(dst, 2 * 8);

      jmp(copy_tail, true);
    }

    // inflate 8 chars per iter
    bind(copy_8_loop);
    pmovzxbw(tmp1, Address(src, len, Address::times_1)); // unpack to 8 words
    movdqu(Address(dst, len, Address::times_2), tmp1);
    addptr(len, 8);
    jcc(Assembler::notZero, copy_8_loop);

    bind(copy_tail);
    movl(len, tmp2);

    cmpl(len, 4);
    jccb(Assembler::less, copy_bytes);

    movdl(tmp1, Address(src, 0)); // load 4 byte chars
    pmovzxbw(tmp1, tmp1);
    movq(Address(dst, 0), tmp1);
    subptr(len, 4);
    addptr(src, 4);
    addptr(dst, 8);

    bind(copy_bytes);
  } else {
    bind(below_threshold);
  }

  testl(len, len);
  jccb(Assembler::zero, done);
  lea(src, Address(src, len, Address::times_1));
  lea(dst, Address(dst, len, Address::times_2));
  negptr(len);

  // inflate 1 char per iter
  bind(copy_chars_loop);
  load_unsigned_byte(tmp2, Address(src, len, Address::times_1)); // load byte char
  movw(Address(dst, len, Address::times_2), tmp2);               // inflate byte char to word
  increment(len);
  jcc(Assembler::notZero, copy_chars_loop);

  bind(done);
}

#ifdef _LP64
void MacroAssembler::convert_f2i(Register dst, XMMRegister src) {
  Label done;
  cvttss2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2i_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_d2i(Register dst, XMMRegister src) {
  Label done;
  cvttsd2sil(dst, src);
  // Conversion instructions do not match JLS for overflow, underflow and NaN -> fixup in stub
  cmpl(dst, 0x80000000); // float_sign_flip
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2i_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_f2l(Register dst, XMMRegister src) {
  Label done;
  cvttss2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movflt(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::f2l_fixup())));
  pop(dst);
  bind(done);
}

void MacroAssembler::convert_d2l(Register dst, XMMRegister src) {
  Label done;
  cvttsd2siq(dst, src);
  cmp64(dst, ExternalAddress((address) StubRoutines::x86::double_sign_flip()));
  jccb(Assembler::notEqual, done);
  subptr(rsp, 8);
  movdbl(Address(rsp, 0), src);
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, StubRoutines::x86::d2l_fixup())));
  pop(dst);
  bind(done);
}
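// The four converters above share one pattern: cvtts* produces the integer
// indefinite value (0x80000000, or 0x8000000000000000 for the 64-bit forms)
// for NaN and out-of-range inputs, which is also the legitimate result for
// exactly MIN_VALUE, so only that one value is sent to the fixup stub. A
// minimal C-style sketch of the JLS semantics the stub restores
// (hypothetical standalone code, not the stub itself):
//
//   #include <stdint.h>
//   #include <math.h>
//
//   static int32_t java_f2i(float f) {
//     if (isnan(f))              return 0;         // JLS: NaN -> 0
//     if (f >= 2147483648.0f)    return INT32_MAX; // saturate on overflow
//     if (f <= -2147483648.0f)   return INT32_MIN;
//     return (int32_t)f;                           // in range: truncate toward zero
//   }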
void MacroAssembler::cache_wb(Address line)
{
  // 64 bit cpus always support clflush
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // prefer clwb (writeback without evict), otherwise
  // prefer clflushopt (potentially parallel writeback with evict),
  // otherwise fall back on clflush (serial writeback with evict)

  if (optimized) {
    if (no_evict) {
      clwb(line);
    } else {
      clflushopt(line);
    }
  } else {
    // no need for fence when using CLFLUSH
    clflush(line);
  }
}

void MacroAssembler::cache_wbsync(bool is_pre)
{
  assert(VM_Version::supports_clflush(), "clflush should be available");
  bool optimized = VM_Version::supports_clflushopt();
  bool no_evict = VM_Version::supports_clwb();

  // pick the correct implementation

  if (!is_pre && (optimized || no_evict)) {
    // need an sfence for post flush when using clflushopt or clwb,
    // otherwise no need for any synchronization

    sfence();
  }
}
#endif // _LP64

Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  _masm->cmp8(ExternalAddress((address)flag_addr), value);
  _masm->jcc(Assembler::equal, _label);
}

SkipIfEqual::~SkipIfEqual() {
  _masm->bind(_label);
}

// 32-bit Windows has its own fast-path implementation
// of get_thread
#if !defined(WIN32) || defined(_LP64)

// This is simply a call to Thread::current()
void MacroAssembler::get_thread(Register thread) {
  if (thread != rax) {
    push(rax);
  }
  LP64_ONLY(push(rdi);)
  LP64_ONLY(push(rsi);)
  push(rdx);
  push(rcx);
#ifdef _LP64
  push(r8);
  push(r9);
  push(r10);
  push(r11);
#endif

  MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, Thread::current), 0);

#ifdef _LP64
  pop(r11);
  pop(r10);
  pop(r9);
  pop(r8);
#endif
  pop(rcx);
  pop(rdx);
  LP64_ONLY(pop(rsi);)
  LP64_ONLY(pop(rdi);)
  if (thread != rax) {
    mov(thread, rax);
    pop(rax);
  }
}

#endif // !WIN32 || _LP64
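// cache_wb/cache_wbsync above implement the usual persistent-memory flush
// protocol: write the data, flush each cache line, then fence once. A
// minimal C-style sketch of the same pattern using compiler intrinsics
// (hypothetical standalone code, assuming CLWB support and <immintrin.h>):
//
//   #include <immintrin.h>
//   #include <stddef.h>
//
//   static void flush_range(const void* p, size_t len) {
//     const char* base = (const char*)p;
//     for (size_t i = 0; i < len; i += 64) {  // 64-byte cache lines
//       _mm_clwb((void*)(base + i));          // writeback without evict
//     }
//     _mm_sfence();                           // one post-flush fence, as in cache_wbsync(false)
//   }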