/*
 * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "compiler/disassembler.hpp"
#include "gc/shared/cardTableModRefBS.hpp"
#include "gc/shared/collectedHeap.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/resourceArea.hpp"
#include "memory/universe.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/biasedLocking.hpp"
#include "runtime/interfaceSupport.hpp"
#include "runtime/objectMonitor.hpp"
#include "runtime/os.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/macros.hpp"
#if INCLUDE_ALL_GCS
#include "gc/g1/g1CollectedHeap.inline.hpp"
#include "gc/g1/g1SATBCardTableModRefBS.hpp"
#include "gc/g1/heapRegion.hpp"
#endif // INCLUDE_ALL_GCS

// In PRODUCT builds block comments are compiled out entirely; otherwise they
// are recorded in the code buffer for the disassembler.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and leave a named marker in the disassembly.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

PRAGMA_FORMAT_MUTE_WARNINGS_FOR_GCC

#ifdef ASSERT
bool AbstractAssembler::pd_check_instruction_mark() { return true; }
#endif

// Maps each x86 condition code to its logical negation, indexed by the
// Assembler::Condition encoding noted in each entry's comment.
static Assembler::Condition reverse[] = {
    Assembler::noOverflow   /* overflow      = 0x0 */ ,
    Assembler::overflow     /* noOverflow    = 0x1 */ ,
    Assembler::aboveEqual   /* carrySet      = 0x2, below = 0x2 */ ,
    Assembler::below        /* aboveEqual    = 0x3, carryClear = 0x3 */ ,
    Assembler::notZero      /* zero          = 0x4, equal = 0x4 */ ,
    Assembler::zero         /* notZero      = 0x5, notEqual = 0x5 */ ,
    Assembler::above        /* belowEqual    = 0x6 */ ,
    Assembler::belowEqual   /* above         = 0x7 */ ,
    Assembler::positive     /* negative      = 0x8 */ ,
    Assembler::negative     /* positive      = 0x9 */ ,
    Assembler::noParity     /* parity        = 0xa */ ,
    Assembler::parity       /* noParity      = 0xb */ ,
    Assembler::greaterEqual /* less          = 0xc */ ,
    Assembler::less         /* greaterEqual  = 0xd */ ,
    Assembler::greater      /* lessEqual     = 0xe */ ,
    Assembler::lessEqual    /* greater       = 0xf, */

};


// Implementation of MacroAssembler

// First all the versions that have distinct versions depending on 32/64 bit
// Unless the difference is trivial (1 line or so).

#ifndef _LP64

// 32bit versions

Address MacroAssembler::as_Address(AddressLiteral adr) {
  return Address(adr.target(), adr.rspec());
}

Address MacroAssembler::as_Address(ArrayAddress adr) {
  return Address::make_array(adr);
}

// Call a C leaf routine; caller pushed the arguments, so pop them afterwards.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments) {
  call(RuntimeAddress(entry_point));
  increment(rsp, number_of_arguments * wordSize);
}

// Compare against an embedded 32-bit Metadata* literal (with relocation).
void MacroAssembler::cmpklass(Address src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpklass(Register src1, Metadata* obj) {
  cmp_literal32(src1, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

// Compare against an embedded 32-bit oop literal (with relocation).
void MacroAssembler::cmpoop(Address src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::cmpoop(Register src1, jobject obj) {
  cmp_literal32(src1, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Sign-extend lo into hi (uses single-instruction cdq when hi:lo is rdx:rax on P6+).
void MacroAssembler::extend_sign(Register hi, Register lo) {
  // According to Intel Doc. AP-526, "Integer Divide", p.18.
  if (VM_Version::is_P6() && hi == rdx && lo == rax) {
    cdql();
  } else {
    movl(hi, lo);
    sarl(hi, 31);
  }
}

// Jump to L if FPU condition flag C2 is set; tmp is used to save/restore rax.
void MacroAssembler::jC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::parity, L);
}

// Jump to L if FPU condition flag C2 is clear; tmp is used to save/restore rax.
void MacroAssembler::jnC2(Register tmp, Label& L) {
  // set parity bit if FPU flag C2 is set (via rax)
  save_rax(tmp);
  fwait(); fnstsw_ax();
  sahf();
  restore_rax(tmp);
  // branch
  jcc(Assembler::noParity, L);
}

// 32bit can do a case table jump in one instruction but we no longer allow the base
// to be installed in the Address class
void MacroAssembler::jump(ArrayAddress entry) {
  jmp(as_Address(entry));
}

// Note: y_lo will be destroyed
void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) {
  // Long compare for Java (semantics as described in JVM spec.)
  // Result in x_hi: -1, 0, or +1 (lcmp semantics).
  Label high, low, done;

  cmpl(x_hi, y_hi);
  jcc(Assembler::less, low);
  jcc(Assembler::greater, high);
  // x_hi is the return register
  xorl(x_hi, x_hi);
  cmpl(x_lo, y_lo);
  jcc(Assembler::below, low);
  jcc(Assembler::equal, done);

  bind(high);
  xorl(x_hi, x_hi);
  increment(x_hi);
  jmp(done);

  bind(low);
  xorl(x_hi, x_hi);
  decrementl(x_hi);

  bind(done);
}

void MacroAssembler::lea(Register dst, AddressLiteral src) {
  mov_literal32(dst, (int32_t)src.target(), src.rspec());
}

void MacroAssembler::lea(Address dst, AddressLiteral adr) {
  // leal(dst, as_Address(adr));
  // see note in movl as to why we must use a move
  mov_literal32(dst, (int32_t) adr.target(), adr.rspec());
}

void MacroAssembler::leave() {
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::lmul(int x_rsp_offset, int y_rsp_offset) {
  // Multiplication of two Java long values stored on the stack
  // as illustrated below. Result is in rdx:rax.
  //
  // rsp ---> [ ?? ] \               \
  //          ....    | y_rsp_offset  |
  //          [ y_lo ] / (in bytes)   | x_rsp_offset
  //          [ y_hi ]                | (in bytes)
  //          ....                    |
  //          [ x_lo ]               /
  //          [ x_hi ]
  //          ....
  //
  // Basic idea: lo(result) = lo(x_lo * y_lo)
  //             hi(result) = hi(x_lo * y_lo) + lo(x_hi * y_lo) + lo(x_lo * y_hi)
  Address x_hi(rsp, x_rsp_offset + wordSize); Address x_lo(rsp, x_rsp_offset);
  Address y_hi(rsp, y_rsp_offset + wordSize); Address y_lo(rsp, y_rsp_offset);
  Label quick;
  // load x_hi, y_hi and check if quick
  // multiplication is possible
  movl(rbx, x_hi);
  movl(rcx, y_hi);
  movl(rax, rbx);
  orl(rbx, rcx);                                 // rbx, = 0 <=> x_hi = 0 and y_hi = 0
  jcc(Assembler::zero, quick);                   // if rbx, = 0 do quick multiply
  // do full multiplication
  // 1st step
  mull(y_lo);                                    // x_hi * y_lo
  movl(rbx, rax);                                // save lo(x_hi * y_lo) in rbx,
  // 2nd step
  movl(rax, x_lo);
  mull(rcx);                                     // x_lo * y_hi
  addl(rbx, rax);                                // add lo(x_lo * y_hi) to rbx,
  // 3rd step
  bind(quick);                                   // note: rbx, = 0 if quick multiply!
  movl(rax, x_lo);
  mull(y_lo);                                    // x_lo * y_lo
  addl(rdx, rbx);                                // correct hi(x_lo * y_lo)
}

// Two's-complement negate of the 64-bit value in hi:lo.
void MacroAssembler::lneg(Register hi, Register lo) {
  negl(lo);
  adcl(hi, 0);
  negl(hi);
}

void MacroAssembler::lshl(Register hi, Register lo) {
  // Java shift left long support (semantics as described in JVM spec., p.305)
  // (basic idea for shift counts s >= n: x << s == (x << n) << (s - n))
  // shift value is in rcx !
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(hi, lo);                                  // x := x << n
  xorl(lo, lo);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shldl(hi, lo);                                 // x := x << s
  shll(lo);
}


void MacroAssembler::lshr(Register hi, Register lo, bool sign_extension) {
  // Java shift right long support (semantics as described in JVM spec., p.306 & p.310)
  // (basic idea for shift counts s >= n: x >> s == (x >> n) >> (s - n))
  assert(hi != rcx, "must not use rcx");
  assert(lo != rcx, "must not use rcx");
  const Register s = rcx;                        // shift count
  const int      n = BitsPerWord;
  Label L;
  andl(s, 0x3f);                                 // s := s & 0x3f (s < 0x40)
  cmpl(s, n);                                    // if (s < n)
  jcc(Assembler::less, L);                       // else (s >= n)
  movl(lo, hi);                                  // x := x >> n
  if (sign_extension) sarl(hi, 31);
  else                xorl(hi, hi);
  // Note: subl(s, n) is not needed since the Intel shift instructions work rcx mod n!
  bind(L);                                       // s (mod n) < n
  shrdl(lo, hi);                                 // x := x >> s
  if (sign_extension) sarl(hi);
  else                shrl(hi);
}

// Load an oop literal as a 32-bit immediate (with oop relocation).
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal32(dst, (int32_t)obj, oop_Relocation::spec_for_immediate());
}

// Load a Metadata* literal as a 32-bit immediate (with metadata relocation).
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal32(dst, (int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  // scratch register is not used,
  // it is defined to match parameters of 64-bit version of this method.
  if (src.is_lval()) {
    mov_literal32(dst, (intptr_t)src.target(), src.rspec());
  } else {
    movl(dst, as_Address(src));
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movl(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movl(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  movl(dst, src);
}


void MacroAssembler::pop_callee_saved_registers() {
  pop(rcx);
  pop(rdx);
  pop(rdi);
  pop(rsi);
}

// Pop the FPU top-of-stack value that push_fTOS stored (2 words on the stack).
void MacroAssembler::pop_fTOS() {
  fld_d(Address(rsp, 0));
  addl(rsp, 2 * wordSize);
}

void MacroAssembler::push_callee_saved_registers() {
  push(rsi);
  push(rdi);
  push(rdx);
  push(rcx);
}

// Spill the FPU top-of-stack value as a double (2 words) onto the stack.
void MacroAssembler::push_fTOS() {
  subl(rsp, 2 * wordSize);
  fstp_d(Address(rsp, 0));
}


void MacroAssembler::pushoop(jobject obj) {
  push_literal32((int32_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::pushklass(Metadata* obj) {
  push_literal32((int32_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::pushptr(AddressLiteral src) {
  if (src.is_lval()) {
    push_literal32((int32_t)src.target(), src.rspec());
  } else {
    pushl(as_Address(src));
  }
}

// dst := 1 if ZF clear, else 0 (via setcc on the low byte of a zeroed dst).
void MacroAssembler::set_word_if_not_zero(Register dst) {
  xorl(dst, dst);
  set_byte_if_not_zero(dst);
}

// On 32-bit, C calling convention passes arguments on the stack; pass_argN
// push in reverse order (arg3 first) so arg0 ends up on top.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  masm->push(arg);
}

#ifndef PRODUCT
extern
"C" void findpc(intptr_t x); 393 #endif 394 395 void MacroAssembler::debug32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip, char* msg) { 396 // In order to get locks to work, we need to fake a in_VM state 397 JavaThread* thread = JavaThread::current(); 398 JavaThreadState saved_state = thread->thread_state(); 399 thread->set_thread_state(_thread_in_vm); 400 if (ShowMessageBoxOnError) { 401 JavaThread* thread = JavaThread::current(); 402 JavaThreadState saved_state = thread->thread_state(); 403 thread->set_thread_state(_thread_in_vm); 404 if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { 405 ttyLocker ttyl; 406 BytecodeCounter::print(); 407 } 408 // To see where a verify_oop failed, get $ebx+40/X for this frame. 409 // This is the value of eip which points to where verify_oop will return. 410 if (os::message_box(msg, "Execution stopped, print registers?")) { 411 print_state32(rdi, rsi, rbp, rsp, rbx, rdx, rcx, rax, eip); 412 BREAKPOINT; 413 } 414 } else { 415 ttyLocker ttyl; 416 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg); 417 } 418 // Don't assert holding the ttyLock 419 assert(false, err_msg("DEBUG MESSAGE: %s", msg)); 420 ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 421 } 422 423 void MacroAssembler::print_state32(int rdi, int rsi, int rbp, int rsp, int rbx, int rdx, int rcx, int rax, int eip) { 424 ttyLocker ttyl; 425 FlagSetting fs(Debugging, true); 426 tty->print_cr("eip = 0x%08x", eip); 427 #ifndef PRODUCT 428 if ((WizardMode || Verbose) && PrintMiscellaneous) { 429 tty->cr(); 430 findpc(eip); 431 tty->cr(); 432 } 433 #endif 434 #define PRINT_REG(rax) \ 435 { tty->print("%s = ", #rax); os::print_location(tty, rax); } 436 PRINT_REG(rax); 437 PRINT_REG(rbx); 438 PRINT_REG(rcx); 439 PRINT_REG(rdx); 440 PRINT_REG(rdi); 441 PRINT_REG(rsi); 442 PRINT_REG(rbp); 443 PRINT_REG(rsp); 444 #undef PRINT_REG 445 // Print some words near top of staack. 
446 int* dump_sp = (int*) rsp; 447 for (int col1 = 0; col1 < 8; col1++) { 448 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 449 os::print_location(tty, *dump_sp++); 450 } 451 for (int row = 0; row < 16; row++) { 452 tty->print("(rsp+0x%03x) 0x%08x: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (intptr_t)dump_sp); 453 for (int col = 0; col < 8; col++) { 454 tty->print(" 0x%08x", *dump_sp++); 455 } 456 tty->cr(); 457 } 458 // Print some instructions around pc: 459 Disassembler::decode((address)eip-64, (address)eip); 460 tty->print_cr("--------"); 461 Disassembler::decode((address)eip, (address)eip+32); 462 } 463 464 void MacroAssembler::stop(const char* msg) { 465 ExternalAddress message((address)msg); 466 // push address of message 467 pushptr(message.addr()); 468 { Label L; call(L, relocInfo::none); bind(L); } // push eip 469 pusha(); // push registers 470 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug32))); 471 hlt(); 472 } 473 474 void MacroAssembler::warn(const char* msg) { 475 push_CPU_state(); 476 477 ExternalAddress message((address) msg); 478 // push address of message 479 pushptr(message.addr()); 480 481 call(RuntimeAddress(CAST_FROM_FN_PTR(address, warning))); 482 addl(rsp, wordSize); // discard argument 483 pop_CPU_state(); 484 } 485 486 void MacroAssembler::print_state() { 487 { Label L; call(L, relocInfo::none); bind(L); } // push eip 488 pusha(); // push registers 489 490 push_CPU_state(); 491 call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::print_state32))); 492 pop_CPU_state(); 493 494 popa(); 495 addl(rsp, wordSize); 496 } 497 498 #else // _LP64 499 500 // 64 bit versions 501 502 Address MacroAssembler::as_Address(AddressLiteral adr) { 503 // amd64 always does this as a pc-rel 504 // we can be absolute or disp based on the instruction type 505 // jmp/call are displacements others are absolute 506 assert(!adr.is_lval(), "must be rval"); 507 assert(reachable(adr), 
"must be"); 508 return Address((int32_t)(intptr_t)(adr.target() - pc()), adr.target(), adr.reloc()); 509 510 } 511 512 Address MacroAssembler::as_Address(ArrayAddress adr) { 513 AddressLiteral base = adr.base(); 514 lea(rscratch1, base); 515 Address index = adr.index(); 516 assert(index._disp == 0, "must not have disp"); // maybe it can? 517 Address array(rscratch1, index._index, index._scale, index._disp); 518 return array; 519 } 520 521 void MacroAssembler::call_VM_leaf_base(address entry_point, int num_args) { 522 Label L, E; 523 524 #ifdef _WIN64 525 // Windows always allocates space for it's register args 526 assert(num_args <= 4, "only register arguments supported"); 527 subq(rsp, frame::arg_reg_save_area_bytes); 528 #endif 529 530 // Align stack if necessary 531 testl(rsp, 15); 532 jcc(Assembler::zero, L); 533 534 subq(rsp, 8); 535 { 536 call(RuntimeAddress(entry_point)); 537 } 538 addq(rsp, 8); 539 jmp(E); 540 541 bind(L); 542 { 543 call(RuntimeAddress(entry_point)); 544 } 545 546 bind(E); 547 548 #ifdef _WIN64 549 // restore stack pointer 550 addq(rsp, frame::arg_reg_save_area_bytes); 551 #endif 552 553 } 554 555 void MacroAssembler::cmp64(Register src1, AddressLiteral src2) { 556 assert(!src2.is_lval(), "should use cmpptr"); 557 558 if (reachable(src2)) { 559 cmpq(src1, as_Address(src2)); 560 } else { 561 lea(rscratch1, src2); 562 Assembler::cmpq(src1, Address(rscratch1, 0)); 563 } 564 } 565 566 int MacroAssembler::corrected_idivq(Register reg) { 567 // Full implementation of Java ldiv and lrem; checks for special 568 // case as described in JVM spec., p.243 & p.271. The function 569 // returns the (pc) offset of the idivl instruction - may be needed 570 // for implicit exceptions. 
571 // 572 // normal case special case 573 // 574 // input : rax: dividend min_long 575 // reg: divisor (may not be eax/edx) -1 576 // 577 // output: rax: quotient (= rax idiv reg) min_long 578 // rdx: remainder (= rax irem reg) 0 579 assert(reg != rax && reg != rdx, "reg cannot be rax or rdx register"); 580 static const int64_t min_long = 0x8000000000000000; 581 Label normal_case, special_case; 582 583 // check for special case 584 cmp64(rax, ExternalAddress((address) &min_long)); 585 jcc(Assembler::notEqual, normal_case); 586 xorl(rdx, rdx); // prepare rdx for possible special case (where 587 // remainder = 0) 588 cmpq(reg, -1); 589 jcc(Assembler::equal, special_case); 590 591 // handle normal case 592 bind(normal_case); 593 cdqq(); 594 int idivq_offset = offset(); 595 idivq(reg); 596 597 // normal and special case exit 598 bind(special_case); 599 600 return idivq_offset; 601 } 602 603 void MacroAssembler::decrementq(Register reg, int value) { 604 if (value == min_jint) { subq(reg, value); return; } 605 if (value < 0) { incrementq(reg, -value); return; } 606 if (value == 0) { ; return; } 607 if (value == 1 && UseIncDec) { decq(reg) ; return; } 608 /* else */ { subq(reg, value) ; return; } 609 } 610 611 void MacroAssembler::decrementq(Address dst, int value) { 612 if (value == min_jint) { subq(dst, value); return; } 613 if (value < 0) { incrementq(dst, -value); return; } 614 if (value == 0) { ; return; } 615 if (value == 1 && UseIncDec) { decq(dst) ; return; } 616 /* else */ { subq(dst, value) ; return; } 617 } 618 619 void MacroAssembler::incrementq(AddressLiteral dst) { 620 if (reachable(dst)) { 621 incrementq(as_Address(dst)); 622 } else { 623 lea(rscratch1, dst); 624 incrementq(Address(rscratch1, 0)); 625 } 626 } 627 628 void MacroAssembler::incrementq(Register reg, int value) { 629 if (value == min_jint) { addq(reg, value); return; } 630 if (value < 0) { decrementq(reg, -value); return; } 631 if (value == 0) { ; return; } 632 if (value == 1 && UseIncDec) { 
incq(reg) ; return; } 633 /* else */ { addq(reg, value) ; return; } 634 } 635 636 void MacroAssembler::incrementq(Address dst, int value) { 637 if (value == min_jint) { addq(dst, value); return; } 638 if (value < 0) { decrementq(dst, -value); return; } 639 if (value == 0) { ; return; } 640 if (value == 1 && UseIncDec) { incq(dst) ; return; } 641 /* else */ { addq(dst, value) ; return; } 642 } 643 644 // 32bit can do a case table jump in one instruction but we no longer allow the base 645 // to be installed in the Address class 646 void MacroAssembler::jump(ArrayAddress entry) { 647 lea(rscratch1, entry.base()); 648 Address dispatch = entry.index(); 649 assert(dispatch._base == noreg, "must be"); 650 dispatch._base = rscratch1; 651 jmp(dispatch); 652 } 653 654 void MacroAssembler::lcmp2int(Register x_hi, Register x_lo, Register y_hi, Register y_lo) { 655 ShouldNotReachHere(); // 64bit doesn't use two regs 656 cmpq(x_lo, y_lo); 657 } 658 659 void MacroAssembler::lea(Register dst, AddressLiteral src) { 660 mov_literal64(dst, (intptr_t)src.target(), src.rspec()); 661 } 662 663 void MacroAssembler::lea(Address dst, AddressLiteral adr) { 664 mov_literal64(rscratch1, (intptr_t)adr.target(), adr.rspec()); 665 movptr(dst, rscratch1); 666 } 667 668 void MacroAssembler::leave() { 669 // %%% is this really better? Why not on 32bit too? 
  emit_int8((unsigned char)0xC9); // LEAVE
}

void MacroAssembler::lneg(Register hi, Register lo) {
  ShouldNotReachHere(); // 64bit doesn't use two regs
  negq(lo);
}

// Load an oop literal as a 64-bit immediate (with oop relocation).
void MacroAssembler::movoop(Register dst, jobject obj) {
  mov_literal64(dst, (intptr_t)obj, oop_Relocation::spec_for_immediate());
}

void MacroAssembler::movoop(Address dst, jobject obj) {
  mov_literal64(rscratch1, (intptr_t)obj, oop_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

// Load a Metadata* literal as a 64-bit immediate (with metadata relocation).
void MacroAssembler::mov_metadata(Register dst, Metadata* obj) {
  mov_literal64(dst, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
}

void MacroAssembler::mov_metadata(Address dst, Metadata* obj) {
  mov_literal64(rscratch1, (intptr_t)obj, metadata_Relocation::spec_for_immediate());
  movq(dst, rscratch1);
}

void MacroAssembler::movptr(Register dst, AddressLiteral src, Register scratch) {
  if (src.is_lval()) {
    mov_literal64(dst, (intptr_t)src.target(), src.rspec());
  } else {
    if (reachable(src)) {
      movq(dst, as_Address(src));
    } else {
      // not rip-reachable: materialize the address in scratch first
      lea(scratch, src);
      movq(dst, Address(scratch, 0));
    }
  }
}

void MacroAssembler::movptr(ArrayAddress dst, Register src) {
  movq(as_Address(dst), src);
}

void MacroAssembler::movptr(Register dst, ArrayAddress src) {
  movq(dst, as_Address(src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Address dst, intptr_t src) {
  mov64(rscratch1, src);
  movq(dst, rscratch1);
}

// These are mostly for initializing NULL
void MacroAssembler::movptr(Address dst, int32_t src) {
  movslq(dst, src);
}

void MacroAssembler::movptr(Register dst, int32_t src) {
  mov64(dst, (intptr_t)src);
}

void MacroAssembler::pushoop(jobject obj) {
  movoop(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushklass(Metadata* obj) {
  mov_metadata(rscratch1, obj);
  push(rscratch1);
}

void MacroAssembler::pushptr(AddressLiteral src) {
  lea(rscratch1, src);
  if (src.is_lval()) {
    push(rscratch1);
  } else {
    pushq(Address(rscratch1, 0));
  }
}

// Clear the last-Java-frame anchor fields in the current thread (r15).
void MacroAssembler::reset_last_Java_frame(bool clear_fp,
                                           bool clear_pc) {
  // we must set sp to zero to clear frame
  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc) {
    movptr(Address(r15_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);
  }
}

// Record the last-Java-frame anchor in the current thread (r15); sp is
// written last.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    movptr(Address(r15_thread, JavaThread::last_Java_fp_offset()),
           last_java_fp);
  }

  // last_java_pc is optional
  if (last_java_pc != NULL) {
    Address java_pc(r15_thread,
                    JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset());
    lea(rscratch1, InternalAddress(last_java_pc));
    movptr(java_pc, rscratch1);
  }

  movptr(Address(r15_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

// On 64-bit, C calling convention passes the first arguments in registers;
// pass_argN moves the value into the corresponding c_rarg register if needed.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Halt with a message: capture rip and all registers, then call debug64.
void MacroAssembler::stop(const char* msg) {
  address rip = pc();
  pusha(); // get regs on stack
  lea(c_rarg0, ExternalAddress((address) msg));
  lea(c_rarg1, InternalAddress(rip));
  movq(c_rarg2, rsp); // pass pointer to regs array
  andq(rsp, -16); // align stack as required by ABI
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, MacroAssembler::debug64)));
  hlt();
}

// Emit a non-fatal warning call; preserves the full CPU state.
void MacroAssembler::warn(const char* msg) {
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes
  lea(c_rarg0, ExternalAddress((address) msg));
  call_VM_leaf(CAST_FROM_FN_PTR(address, warning), c_rarg0);
  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
}

void MacroAssembler::print_state() {
  address rip = pc();
  pusha();            // get regs on stack
  push(rbp);
  movq(rbp, rsp);
  andq(rsp, -16);     // align stack as required by push_CPU_state and call
  push_CPU_state();   // keeps alignment at 16 bytes

  lea(c_rarg0, InternalAddress(rip));
  lea(c_rarg1, Address(rbp, wordSize)); // pass pointer to regs array
  call_VM_leaf(CAST_FROM_FN_PTR(address, MacroAssembler::print_state64), c_rarg0, c_rarg1);

  pop_CPU_state();
  mov(rsp, rbp);
  pop(rbp);
  popa();
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime target of the 64-bit stop(): prints the register array captured by
// pusha (regs[0] = r15 ... regs[15] = rax) plus the faked rip, then asserts.
void MacroAssembler::debug64(char* msg, int64_t pc, int64_t regs[]) {
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    // To see where a verify_oop failed, get $ebx+40/X for this frame.
    // XXX correct this offset for amd64
    // This is the value of eip which points to where verify_oop will return.
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      print_state64(pc, regs);
      BREAKPOINT;
      assert(false, "start up GDB");
    }
    ThreadStateTransition::transition(thread, _thread_in_vm, saved_state);
  } else {
    ttyLocker ttyl;
    ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n",
                    msg);
    assert(false, err_msg("DEBUG MESSAGE: %s", msg));
  }
}

// Print the 64-bit register set, nearby stack words, and instructions around pc.
// regs[] holds the pusha image: regs[0] = r15 ... regs[15] = rax.
void MacroAssembler::print_state64(int64_t pc, int64_t regs[]) {
  ttyLocker ttyl;
  FlagSetting fs(Debugging, true);
  tty->print_cr("rip = 0x%016lx", pc);
#ifndef PRODUCT
  tty->cr();
  findpc(pc);
  tty->cr();
#endif
#define PRINT_REG(rax, value) \
  { tty->print("%s = ", #rax); os::print_location(tty, value); }
  PRINT_REG(rax, regs[15]);
  PRINT_REG(rbx, regs[12]);
  PRINT_REG(rcx, regs[14]);
  PRINT_REG(rdx, regs[13]);
  PRINT_REG(rdi, regs[8]);
  PRINT_REG(rsi, regs[9]);
  PRINT_REG(rbp, regs[10]);
  PRINT_REG(rsp, regs[11]);
  PRINT_REG(r8 , regs[7]);
  PRINT_REG(r9 , regs[6]);
  PRINT_REG(r10, regs[5]);
  PRINT_REG(r11, regs[4]);
  PRINT_REG(r12, regs[3]);
  PRINT_REG(r13, regs[2]);
  PRINT_REG(r14, regs[1]);
  PRINT_REG(r15, regs[0]);
#undef PRINT_REG
  // Print some words near top of stack.
  int64_t* rsp = (int64_t*) regs[11];
  int64_t* dump_sp = rsp;
  for (int col1 = 0; col1 < 8; col1++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    os::print_location(tty, *dump_sp++);
  }
  for (int row = 0; row < 25; row++) {
    tty->print("(rsp+0x%03x) 0x%016lx: ", (int)((intptr_t)dump_sp - (intptr_t)rsp), (int64_t)dump_sp);
    for (int col = 0; col < 4; col++) {
      tty->print(" 0x%016lx", *dump_sp++);
    }
    tty->cr();
  }
  // Print some instructions around pc:
  Disassembler::decode((address)pc-64, (address)pc);
  tty->print_cr("--------");
  Disassembler::decode((address)pc, (address)pc+32);
}

#endif // _LP64

// Now versions that are common to 32/64 bit

void MacroAssembler::addptr(Register dst, int32_t imm32) {
  LP64_ONLY(addq(dst, imm32)) NOT_LP64(addl(dst, imm32));
}

void MacroAssembler::addptr(Register dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

void MacroAssembler::addptr(Address dst, Register src) {
  LP64_ONLY(addq(dst, src)) NOT_LP64(addl(dst, src));
}

// addsd/addss with an AddressLiteral operand; go through rscratch1 when the
// literal is not directly addressable.
void MacroAssembler::addsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::addsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::addsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::addss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    addss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    addss(dst, Address(rscratch1, 0));
  }
}

// Pad with nops until the code offset is a multiple of modulus.
void MacroAssembler::align(int modulus) {
  if (offset() % modulus != 0) {
    nop(modulus - (offset() % modulus));
  }
}

void MacroAssembler::andpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-masking with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::andps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::andps(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::andptr(Register dst, int32_t imm32) {
  LP64_ONLY(andq(dst, imm32)) NOT_LP64(andl(dst, imm32));
}

// Atomically increment a 32-bit counter in memory (lock prefix only on MP).
void MacroAssembler::atomic_incl(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementl(counter_addr);
}

void MacroAssembler::atomic_incl(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incl(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incl(Address(scr, 0));
  }
}

#ifdef _LP64
// Atomically increment a 64-bit counter in memory (lock prefix only on MP).
void MacroAssembler::atomic_incq(Address counter_addr) {
  if (os::is_MP())
    lock();
  incrementq(counter_addr);
}

void MacroAssembler::atomic_incq(AddressLiteral counter_addr, Register scr) {
  if (reachable(counter_addr)) {
    atomic_incq(as_Address(counter_addr));
  } else {
    lea(scr, counter_addr);
    atomic_incq(Address(scr, 0));
  }
}
#endif

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages.  This clobbers tmp.
void MacroAssembler::bang_stack_size(Register size, Register tmp) {
  movptr(tmp, rsp);
  // Bang stack for total size given plus shadow page size.
  // Bang one page at a time because large size can bang beyond yellow and
  // red zones.
  Label loop;
  bind(loop);
  movl(Address(tmp, (-os::vm_page_size())), size );
  subptr(tmp, os::vm_page_size());
  subl(size, os::vm_page_size());
  jcc(Assembler::greater, loop);

  // Bang down shadow pages too.
  // At this point, (tmp-0) is the last address touched, so don't
  // touch it again.  (It was touched as (tmp-pagesize) but then tmp
  // was post-decremented.)  Skip this address by starting at i=1, and
  // touch a few more pages below.  N.B.  It is important to touch all
  // the way down to and including i=StackShadowPages.
  for (int i = 1; i < StackShadowPages; i++) {
    // this could be any sized move but this can be a debugging crumb
    // so the bigger the better.
    movptr(Address(tmp, (-i*os::vm_page_size())), size );
  }
}

int MacroAssembler::biased_locking_enter(Register lock_reg,
                                         Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");
  assert(swap_reg == rax, "swap_reg must be rax for cmpxchgq");
  assert(tmp_reg != noreg, "tmp_reg must be supplied");
  assert_different_registers(lock_reg, obj_reg, swap_reg, tmp_reg);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr      (obj_reg, oopDesc::mark_offset_in_bytes());
  Address saved_mark_addr(lock_reg, 0);

  if (PrintBiasedLockingStatistics && counters == NULL) {
    counters = BiasedLocking::counters();
  }
  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    movptr(swap_reg, mark_addr);
  }
  movptr(tmp_reg, swap_reg);
  andptr(tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cmpptr(tmp_reg, markOopDesc::biased_lock_pattern);
  jcc(Assembler::notEqual, cas_label);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
#ifndef _LP64
  // Note that because there is no current thread register on x86_32 we
  // need to store off the mark word we read out of the object to
  // avoid reloading it and needing to recheck invariants below. This
  // store is unfortunate but it makes the overall code shorter and
  // simpler.
  movptr(saved_mark_addr, swap_reg);
#endif
  if (swap_reg_contains_mark) {
    null_check_offset = offset();
  }
  load_prototype_header(tmp_reg, obj_reg);
#ifdef _LP64
  orptr(tmp_reg, r15_thread);
  xorptr(tmp_reg, swap_reg);
  Register header_reg = tmp_reg;
#else
  xorptr(tmp_reg, swap_reg);
  get_thread(swap_reg);
  xorptr(swap_reg, tmp_reg);
  Register header_reg = swap_reg;
#endif
  andptr(header_reg, ~((int) markOopDesc::age_mask_in_place));
  if (counters != NULL) {
    cond_inc32(Assembler::zero,
               ExternalAddress((address) counters->biased_lock_entry_count_addr()));
  }
  jcc(Assembler::equal, done);

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch.
We need to 1133 // figure out more details about the state of the header in order to 1134 // know what operations can be legally performed on the object's 1135 // header. 1136 1137 // If the low three bits in the xor result aren't clear, that means 1138 // the prototype header is no longer biased and we have to revoke 1139 // the bias on this object. 1140 testptr(header_reg, markOopDesc::biased_lock_mask_in_place); 1141 jccb(Assembler::notZero, try_revoke_bias); 1142 1143 // Biasing is still enabled for this data type. See whether the 1144 // epoch of the current bias is still valid, meaning that the epoch 1145 // bits of the mark word are equal to the epoch bits of the 1146 // prototype header. (Note that the prototype header's epoch bits 1147 // only change at a safepoint.) If not, attempt to rebias the object 1148 // toward the current thread. Note that we must be absolutely sure 1149 // that the current epoch is invalid in order to do this because 1150 // otherwise the manipulations it performs on the mark word are 1151 // illegal. 1152 testptr(header_reg, markOopDesc::epoch_mask_in_place); 1153 jccb(Assembler::notZero, try_rebias); 1154 1155 // The epoch of the current bias is still valid but we know nothing 1156 // about the owner; it might be set or it might be clear. Try to 1157 // acquire the bias of the object using an atomic operation. If this 1158 // fails we will go in to the runtime to revoke the object's bias. 1159 // Note that we first construct the presumed unbiased header so we 1160 // don't accidentally blow away another thread's valid bias. 
1161 NOT_LP64( movptr(swap_reg, saved_mark_addr); ) 1162 andptr(swap_reg, 1163 markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); 1164 #ifdef _LP64 1165 movptr(tmp_reg, swap_reg); 1166 orptr(tmp_reg, r15_thread); 1167 #else 1168 get_thread(tmp_reg); 1169 orptr(tmp_reg, swap_reg); 1170 #endif 1171 if (os::is_MP()) { 1172 lock(); 1173 } 1174 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg 1175 // If the biasing toward our thread failed, this means that 1176 // another thread succeeded in biasing it toward itself and we 1177 // need to revoke that bias. The revocation will occur in the 1178 // interpreter runtime in the slow case. 1179 if (counters != NULL) { 1180 cond_inc32(Assembler::zero, 1181 ExternalAddress((address) counters->anonymously_biased_lock_entry_count_addr())); 1182 } 1183 if (slow_case != NULL) { 1184 jcc(Assembler::notZero, *slow_case); 1185 } 1186 jmp(done); 1187 1188 bind(try_rebias); 1189 // At this point we know the epoch has expired, meaning that the 1190 // current "bias owner", if any, is actually invalid. Under these 1191 // circumstances _only_, we are allowed to use the current header's 1192 // value as the comparison value when doing the cas to acquire the 1193 // bias in the current epoch. In other words, we allow transfer of 1194 // the bias from one thread to another directly in this situation. 1195 // 1196 // FIXME: due to a lack of registers we currently blow away the age 1197 // bits in this situation. Should attempt to preserve them. 
1198 load_prototype_header(tmp_reg, obj_reg); 1199 #ifdef _LP64 1200 orptr(tmp_reg, r15_thread); 1201 #else 1202 get_thread(swap_reg); 1203 orptr(tmp_reg, swap_reg); 1204 movptr(swap_reg, saved_mark_addr); 1205 #endif 1206 if (os::is_MP()) { 1207 lock(); 1208 } 1209 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg 1210 // If the biasing toward our thread failed, then another thread 1211 // succeeded in biasing it toward itself and we need to revoke that 1212 // bias. The revocation will occur in the runtime in the slow case. 1213 if (counters != NULL) { 1214 cond_inc32(Assembler::zero, 1215 ExternalAddress((address) counters->rebiased_lock_entry_count_addr())); 1216 } 1217 if (slow_case != NULL) { 1218 jcc(Assembler::notZero, *slow_case); 1219 } 1220 jmp(done); 1221 1222 bind(try_revoke_bias); 1223 // The prototype mark in the klass doesn't have the bias bit set any 1224 // more, indicating that objects of this data type are not supposed 1225 // to be biased any more. We are going to try to reset the mark of 1226 // this object to the prototype value and fall through to the 1227 // CAS-based locking scheme. Note that if our CAS fails, it means 1228 // that another thread raced us for the privilege of revoking the 1229 // bias of this particular object, so it's okay to continue in the 1230 // normal locking code. 1231 // 1232 // FIXME: due to a lack of registers we currently blow away the age 1233 // bits in this situation. Should attempt to preserve them. 1234 NOT_LP64( movptr(swap_reg, saved_mark_addr); ) 1235 load_prototype_header(tmp_reg, obj_reg); 1236 if (os::is_MP()) { 1237 lock(); 1238 } 1239 cmpxchgptr(tmp_reg, mark_addr); // compare tmp_reg and swap_reg 1240 // Fall through to the normal CAS-based lock, because no matter what 1241 // the result of the above CAS, some thread must have succeeded in 1242 // removing the bias bit from the object's header. 
1243 if (counters != NULL) { 1244 cond_inc32(Assembler::zero, 1245 ExternalAddress((address) counters->revoked_lock_entry_count_addr())); 1246 } 1247 1248 bind(cas_label); 1249 1250 return null_check_offset; 1251 } 1252 1253 void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { 1254 assert(UseBiasedLocking, "why call this otherwise?"); 1255 1256 // Check for biased locking unlock case, which is a no-op 1257 // Note: we do not have to check the thread ID for two reasons. 1258 // First, the interpreter checks for IllegalMonitorStateException at 1259 // a higher level. Second, if the bias was revoked while we held the 1260 // lock, the object could not be rebiased toward another thread, so 1261 // the bias bit would be clear. 1262 movptr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); 1263 andptr(temp_reg, markOopDesc::biased_lock_mask_in_place); 1264 cmpptr(temp_reg, markOopDesc::biased_lock_pattern); 1265 jcc(Assembler::equal, done); 1266 } 1267 1268 #ifdef COMPILER2 1269 1270 #if INCLUDE_RTM_OPT 1271 1272 // Update rtm_counters based on abort status 1273 // input: abort_status 1274 // rtm_counters (RTMLockingCounters*) 1275 // flags are killed 1276 void MacroAssembler::rtm_counters_update(Register abort_status, Register rtm_counters) { 1277 1278 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abort_count_offset())); 1279 if (PrintPreciseRTMLockingStatistics) { 1280 for (int i = 0; i < RTMLockingCounters::ABORT_STATUS_LIMIT; i++) { 1281 Label check_abort; 1282 testl(abort_status, (1<<i)); 1283 jccb(Assembler::equal, check_abort); 1284 atomic_incptr(Address(rtm_counters, RTMLockingCounters::abortX_count_offset() + (i * sizeof(uintx)))); 1285 bind(check_abort); 1286 } 1287 } 1288 } 1289 1290 // Branch if (random & (count-1) != 0), count is 2^n 1291 // tmp, scr and flags are killed 1292 void MacroAssembler::branch_on_random_using_rdtsc(Register tmp, Register scr, int count, Label& brLabel) { 1293 assert(tmp 
== rax, ""); 1294 assert(scr == rdx, ""); 1295 rdtsc(); // modifies EDX:EAX 1296 andptr(tmp, count-1); 1297 jccb(Assembler::notZero, brLabel); 1298 } 1299 1300 // Perform abort ratio calculation, set no_rtm bit if high ratio 1301 // input: rtm_counters_Reg (RTMLockingCounters* address) 1302 // tmpReg, rtm_counters_Reg and flags are killed 1303 void MacroAssembler::rtm_abort_ratio_calculation(Register tmpReg, 1304 Register rtm_counters_Reg, 1305 RTMLockingCounters* rtm_counters, 1306 Metadata* method_data) { 1307 Label L_done, L_check_always_rtm1, L_check_always_rtm2; 1308 1309 if (RTMLockingCalculationDelay > 0) { 1310 // Delay calculation 1311 movptr(tmpReg, ExternalAddress((address) RTMLockingCounters::rtm_calculation_flag_addr()), tmpReg); 1312 testptr(tmpReg, tmpReg); 1313 jccb(Assembler::equal, L_done); 1314 } 1315 // Abort ratio calculation only if abort_count > RTMAbortThreshold 1316 // Aborted transactions = abort_count * 100 1317 // All transactions = total_count * RTMTotalCountIncrRate 1318 // Set no_rtm bit if (Aborted transactions >= All transactions * RTMAbortRatio) 1319 1320 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::abort_count_offset())); 1321 cmpptr(tmpReg, RTMAbortThreshold); 1322 jccb(Assembler::below, L_check_always_rtm2); 1323 imulptr(tmpReg, tmpReg, 100); 1324 1325 Register scrReg = rtm_counters_Reg; 1326 movptr(scrReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 1327 imulptr(scrReg, scrReg, RTMTotalCountIncrRate); 1328 imulptr(scrReg, scrReg, RTMAbortRatio); 1329 cmpptr(tmpReg, scrReg); 1330 jccb(Assembler::below, L_check_always_rtm1); 1331 if (method_data != NULL) { 1332 // set rtm_state to "no rtm" in MDO 1333 mov_metadata(tmpReg, method_data); 1334 if (os::is_MP()) { 1335 lock(); 1336 } 1337 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), NoRTM); 1338 } 1339 jmpb(L_done); 1340 bind(L_check_always_rtm1); 1341 // Reload RTMLockingCounters* address 1342 lea(rtm_counters_Reg, 
ExternalAddress((address)rtm_counters)); 1343 bind(L_check_always_rtm2); 1344 movptr(tmpReg, Address(rtm_counters_Reg, RTMLockingCounters::total_count_offset())); 1345 cmpptr(tmpReg, RTMLockingThreshold / RTMTotalCountIncrRate); 1346 jccb(Assembler::below, L_done); 1347 if (method_data != NULL) { 1348 // set rtm_state to "always rtm" in MDO 1349 mov_metadata(tmpReg, method_data); 1350 if (os::is_MP()) { 1351 lock(); 1352 } 1353 orl(Address(tmpReg, MethodData::rtm_state_offset_in_bytes()), UseRTM); 1354 } 1355 bind(L_done); 1356 } 1357 1358 // Update counters and perform abort ratio calculation 1359 // input: abort_status_Reg 1360 // rtm_counters_Reg, flags are killed 1361 void MacroAssembler::rtm_profiling(Register abort_status_Reg, 1362 Register rtm_counters_Reg, 1363 RTMLockingCounters* rtm_counters, 1364 Metadata* method_data, 1365 bool profile_rtm) { 1366 1367 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 1368 // update rtm counters based on rax value at abort 1369 // reads abort_status_Reg, updates flags 1370 lea(rtm_counters_Reg, ExternalAddress((address)rtm_counters)); 1371 rtm_counters_update(abort_status_Reg, rtm_counters_Reg); 1372 if (profile_rtm) { 1373 // Save abort status because abort_status_Reg is used by following code. 
1374 if (RTMRetryCount > 0) { 1375 push(abort_status_Reg); 1376 } 1377 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 1378 rtm_abort_ratio_calculation(abort_status_Reg, rtm_counters_Reg, rtm_counters, method_data); 1379 // restore abort status 1380 if (RTMRetryCount > 0) { 1381 pop(abort_status_Reg); 1382 } 1383 } 1384 } 1385 1386 // Retry on abort if abort's status is 0x6: can retry (0x2) | memory conflict (0x4) 1387 // inputs: retry_count_Reg 1388 // : abort_status_Reg 1389 // output: retry_count_Reg decremented by 1 1390 // flags are killed 1391 void MacroAssembler::rtm_retry_lock_on_abort(Register retry_count_Reg, Register abort_status_Reg, Label& retryLabel) { 1392 Label doneRetry; 1393 assert(abort_status_Reg == rax, ""); 1394 // The abort reason bits are in eax (see all states in rtmLocking.hpp) 1395 // 0x6 = conflict on which we can retry (0x2) | memory conflict (0x4) 1396 // if reason is in 0x6 and retry count != 0 then retry 1397 andptr(abort_status_Reg, 0x6); 1398 jccb(Assembler::zero, doneRetry); 1399 testl(retry_count_Reg, retry_count_Reg); 1400 jccb(Assembler::zero, doneRetry); 1401 pause(); 1402 decrementl(retry_count_Reg); 1403 jmp(retryLabel); 1404 bind(doneRetry); 1405 } 1406 1407 // Spin and retry if lock is busy, 1408 // inputs: box_Reg (monitor address) 1409 // : retry_count_Reg 1410 // output: retry_count_Reg decremented by 1 1411 // : clear z flag if retry count exceeded 1412 // tmp_Reg, scr_Reg, flags are killed 1413 void MacroAssembler::rtm_retry_lock_on_busy(Register retry_count_Reg, Register box_Reg, 1414 Register tmp_Reg, Register scr_Reg, Label& retryLabel) { 1415 Label SpinLoop, SpinExit, doneRetry; 1416 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 1417 1418 testl(retry_count_Reg, retry_count_Reg); 1419 jccb(Assembler::zero, doneRetry); 1420 decrementl(retry_count_Reg); 1421 movptr(scr_Reg, RTMSpinLoopCount); 1422 1423 bind(SpinLoop); 1424 pause(); 1425 decrementl(scr_Reg); 1426 
jccb(Assembler::lessEqual, SpinExit); 1427 movptr(tmp_Reg, Address(box_Reg, owner_offset)); 1428 testptr(tmp_Reg, tmp_Reg); 1429 jccb(Assembler::notZero, SpinLoop); 1430 1431 bind(SpinExit); 1432 jmp(retryLabel); 1433 bind(doneRetry); 1434 incrementl(retry_count_Reg); // clear z flag 1435 } 1436 1437 // Use RTM for normal stack locks 1438 // Input: objReg (object to lock) 1439 void MacroAssembler::rtm_stack_locking(Register objReg, Register tmpReg, Register scrReg, 1440 Register retry_on_abort_count_Reg, 1441 RTMLockingCounters* stack_rtm_counters, 1442 Metadata* method_data, bool profile_rtm, 1443 Label& DONE_LABEL, Label& IsInflated) { 1444 assert(UseRTMForStackLocks, "why call this otherwise?"); 1445 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 1446 assert(tmpReg == rax, ""); 1447 assert(scrReg == rdx, ""); 1448 Label L_rtm_retry, L_decrement_retry, L_on_abort; 1449 1450 if (RTMRetryCount > 0) { 1451 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 1452 bind(L_rtm_retry); 1453 } 1454 movptr(tmpReg, Address(objReg, 0)); 1455 testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased 1456 jcc(Assembler::notZero, IsInflated); 1457 1458 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 1459 Label L_noincrement; 1460 if (RTMTotalCountIncrRate > 1) { 1461 // tmpReg, scrReg and flags are killed 1462 branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement); 1463 } 1464 assert(stack_rtm_counters != NULL, "should not be NULL when profiling RTM"); 1465 atomic_incptr(ExternalAddress((address)stack_rtm_counters->total_count_addr()), scrReg); 1466 bind(L_noincrement); 1467 } 1468 xbegin(L_on_abort); 1469 movptr(tmpReg, Address(objReg, 0)); // fetch markword 1470 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 1471 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked 1472 jcc(Assembler::equal, DONE_LABEL); // all done 
if unlocked 1473 1474 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 1475 if (UseRTMXendForLockBusy) { 1476 xend(); 1477 movptr(abort_status_Reg, 0x2); // Set the abort status to 2 (so we can retry) 1478 jmp(L_decrement_retry); 1479 } 1480 else { 1481 xabort(0); 1482 } 1483 bind(L_on_abort); 1484 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 1485 rtm_profiling(abort_status_Reg, scrReg, stack_rtm_counters, method_data, profile_rtm); 1486 } 1487 bind(L_decrement_retry); 1488 if (RTMRetryCount > 0) { 1489 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 1490 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 1491 } 1492 } 1493 1494 // Use RTM for inflating locks 1495 // inputs: objReg (object to lock) 1496 // boxReg (on-stack box address (displaced header location) - KILLED) 1497 // tmpReg (ObjectMonitor address + markOopDesc::monitor_value) 1498 void MacroAssembler::rtm_inflated_locking(Register objReg, Register boxReg, Register tmpReg, 1499 Register scrReg, Register retry_on_busy_count_Reg, 1500 Register retry_on_abort_count_Reg, 1501 RTMLockingCounters* rtm_counters, 1502 Metadata* method_data, bool profile_rtm, 1503 Label& DONE_LABEL) { 1504 assert(UseRTMLocking, "why call this otherwise?"); 1505 assert(tmpReg == rax, ""); 1506 assert(scrReg == rdx, ""); 1507 Label L_rtm_retry, L_decrement_retry, L_on_abort; 1508 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 1509 1510 // Without cast to int32_t a movptr will destroy r10 which is typically obj 1511 movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark())); 1512 movptr(boxReg, tmpReg); // Save ObjectMonitor address 1513 1514 if (RTMRetryCount > 0) { 1515 movl(retry_on_busy_count_Reg, RTMRetryCount); // Retry on lock busy 1516 movl(retry_on_abort_count_Reg, RTMRetryCount); // Retry on abort 1517 bind(L_rtm_retry); 1518 } 1519 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 1520 
Label L_noincrement; 1521 if (RTMTotalCountIncrRate > 1) { 1522 // tmpReg, scrReg and flags are killed 1523 branch_on_random_using_rdtsc(tmpReg, scrReg, (int)RTMTotalCountIncrRate, L_noincrement); 1524 } 1525 assert(rtm_counters != NULL, "should not be NULL when profiling RTM"); 1526 atomic_incptr(ExternalAddress((address)rtm_counters->total_count_addr()), scrReg); 1527 bind(L_noincrement); 1528 } 1529 xbegin(L_on_abort); 1530 movptr(tmpReg, Address(objReg, 0)); 1531 movptr(tmpReg, Address(tmpReg, owner_offset)); 1532 testptr(tmpReg, tmpReg); 1533 jcc(Assembler::zero, DONE_LABEL); 1534 if (UseRTMXendForLockBusy) { 1535 xend(); 1536 jmp(L_decrement_retry); 1537 } 1538 else { 1539 xabort(0); 1540 } 1541 bind(L_on_abort); 1542 Register abort_status_Reg = tmpReg; // status of abort is stored in RAX 1543 if (PrintPreciseRTMLockingStatistics || profile_rtm) { 1544 rtm_profiling(abort_status_Reg, scrReg, rtm_counters, method_data, profile_rtm); 1545 } 1546 if (RTMRetryCount > 0) { 1547 // retry on lock abort if abort status is 'can retry' (0x2) or 'memory conflict' (0x4) 1548 rtm_retry_lock_on_abort(retry_on_abort_count_Reg, abort_status_Reg, L_rtm_retry); 1549 } 1550 1551 movptr(tmpReg, Address(boxReg, owner_offset)) ; 1552 testptr(tmpReg, tmpReg) ; 1553 jccb(Assembler::notZero, L_decrement_retry) ; 1554 1555 // Appears unlocked - try to swing _owner from null to non-null. 1556 // Invariant: tmpReg == 0. tmpReg is EAX which is the implicit cmpxchg comparand. 1557 #ifdef _LP64 1558 Register threadReg = r15_thread; 1559 #else 1560 get_thread(scrReg); 1561 Register threadReg = scrReg; 1562 #endif 1563 if (os::is_MP()) { 1564 lock(); 1565 } 1566 cmpxchgptr(threadReg, Address(boxReg, owner_offset)); // Updates tmpReg 1567 1568 if (RTMRetryCount > 0) { 1569 // success done else retry 1570 jccb(Assembler::equal, DONE_LABEL) ; 1571 bind(L_decrement_retry); 1572 // Spin and retry if lock is busy. 
1573 rtm_retry_lock_on_busy(retry_on_busy_count_Reg, boxReg, tmpReg, scrReg, L_rtm_retry); 1574 } 1575 else { 1576 bind(L_decrement_retry); 1577 } 1578 } 1579 1580 #endif // INCLUDE_RTM_OPT 1581 1582 // Fast_Lock and Fast_Unlock used by C2 1583 1584 // Because the transitions from emitted code to the runtime 1585 // monitorenter/exit helper stubs are so slow it's critical that 1586 // we inline both the stack-locking fast-path and the inflated fast path. 1587 // 1588 // See also: cmpFastLock and cmpFastUnlock. 1589 // 1590 // What follows is a specialized inline transliteration of the code 1591 // in slow_enter() and slow_exit(). If we're concerned about I$ bloat 1592 // another option would be to emit TrySlowEnter and TrySlowExit methods 1593 // at startup-time. These methods would accept arguments as 1594 // (rax,=Obj, rbx=Self, rcx=box, rdx=Scratch) and return success-failure 1595 // indications in the icc.ZFlag. Fast_Lock and Fast_Unlock would simply 1596 // marshal the arguments and emit calls to TrySlowEnter and TrySlowExit. 1597 // In practice, however, the # of lock sites is bounded and is usually small. 1598 // Besides the call overhead, TrySlowEnter and TrySlowExit might suffer 1599 // if the processor uses simple bimodal branch predictors keyed by EIP 1600 // Since the helper routines would be called from multiple synchronization 1601 // sites. 1602 // 1603 // An even better approach would be write "MonitorEnter()" and "MonitorExit()" 1604 // in java - using j.u.c and unsafe - and just bind the lock and unlock sites 1605 // to those specialized methods. That'd give us a mostly platform-independent 1606 // implementation that the JITs could optimize and inline at their pleasure. 1607 // Done correctly, the only time we'd need to cross to native could would be 1608 // to park() or unpark() threads. 
We'd also need a few more unsafe operators 1609 // to (a) prevent compiler-JIT reordering of non-volatile accesses, and 1610 // (b) explicit barriers or fence operations. 1611 // 1612 // TODO: 1613 // 1614 // * Arrange for C2 to pass "Self" into Fast_Lock and Fast_Unlock in one of the registers (scr). 1615 // This avoids manifesting the Self pointer in the Fast_Lock and Fast_Unlock terminals. 1616 // Given TLAB allocation, Self is usually manifested in a register, so passing it into 1617 // the lock operators would typically be faster than reifying Self. 1618 // 1619 // * Ideally I'd define the primitives as: 1620 // fast_lock (nax Obj, nax box, EAX tmp, nax scr) where box, tmp and scr are KILLED. 1621 // fast_unlock (nax Obj, EAX box, nax tmp) where box and tmp are KILLED 1622 // Unfortunately ADLC bugs prevent us from expressing the ideal form. 1623 // Instead, we're stuck with a rather awkward and brittle register assignments below. 1624 // Furthermore the register assignments are overconstrained, possibly resulting in 1625 // sub-optimal code near the synchronization site. 1626 // 1627 // * Eliminate the sp-proximity tests and just use "== Self" tests instead. 1628 // Alternately, use a better sp-proximity test. 1629 // 1630 // * Currently ObjectMonitor._Owner can hold either an sp value or a (THREAD *) value. 1631 // Either one is sufficient to uniquely identify a thread. 1632 // TODO: eliminate use of sp in _owner and use get_thread(tr) instead. 1633 // 1634 // * Intrinsify notify() and notifyAll() for the common cases where the 1635 // object is locked by the calling thread but the waitlist is empty. 1636 // avoid the expensive JNI call to JVM_Notify() and JVM_NotifyAll(). 1637 // 1638 // * use jccb and jmpb instead of jcc and jmp to improve code density. 1639 // But beware of excessive branch density on AMD Opterons. 1640 // 1641 // * Both Fast_Lock and Fast_Unlock set the ICC.ZF to indicate success 1642 // or failure of the fast-path. 
// If the fast-path fails then we pass
// control to the slow-path, typically in C.  In Fast_Lock and
// Fast_Unlock we often branch to DONE_LABEL, just to find that C2
// will emit a conditional branch immediately after the node.
// So we have branches to branches and lots of ICC.ZF games.
// Instead, it might be better to have C2 pass a "FailureLabel"
// into Fast_Lock and Fast_Unlock.  In the case of success, control
// will drop through the node.  ICC.ZF is undefined at exit.
// In the case of failure, the node will branch directly to the
// FailureLabel


// obj: object to lock
// box: on-stack box address (displaced header location) - KILLED
// rax,: tmp -- KILLED
// scr: tmp -- KILLED
void MacroAssembler::fast_lock(Register objReg, Register boxReg, Register tmpReg,
                               Register scrReg, Register cx1Reg, Register cx2Reg,
                               BiasedLockingCounters* counters,
                               RTMLockingCounters* rtm_counters,
                               RTMLockingCounters* stack_rtm_counters,
                               Metadata* method_data,
                               bool use_rtm, bool profile_rtm) {
  // Ensure the register assignments are disjoint
  assert(tmpReg == rax, "");

  if (use_rtm) {
    assert_different_registers(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg);
  } else {
    assert(cx1Reg == noreg, "");
    assert(cx2Reg == noreg, "");
    assert_different_registers(objReg, boxReg, tmpReg, scrReg);
  }

  if (counters != NULL) {
    atomic_incl(ExternalAddress((address)counters->total_entry_count_addr()), scrReg);
  }
  if (EmitSync & 1) {
      // set box->dhw = markOopDesc::unused_mark()
      // Force all sync thru slow-path: slow_enter() and slow_exit()
      movptr (Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
      cmpptr (rsp, (int32_t)NULL_WORD);
  } else {
    // Possible cases that we'll encounter in fast_lock
    // ------------------------------------------------
    // * Inflated
    //    -- unlocked
    //    -- Locked
    //       = by self
    //       = by other
    // * biased
    //    -- by Self
    //    -- by other
    // * neutral
    // * stack-locked
    //    -- by self
    //       = sp-proximity test hits
    //       = sp-proximity test generates false-negative
    //    -- by other
    //

    Label IsInflated, DONE_LABEL;

    // it's stack-locked, biased or neutral
    // TODO: optimize away redundant LDs of obj->mark and improve the markword triage
    // order to reduce the number of conditional branches in the most common cases.
    // Beware -- there's a subtle invariant that fetch of the markword
    // at [FETCH], below, will never observe a biased encoding (*101b).
    // If this invariant is not held we risk exclusion (safety) failure.
    if (UseBiasedLocking && !UseOptoBiasInlining) {
      biased_locking_enter(boxReg, objReg, tmpReg, scrReg, false, DONE_LABEL, NULL, counters);
    }

#if INCLUDE_RTM_OPT
    if (UseRTMForStackLocks && use_rtm) {
      rtm_stack_locking(objReg, tmpReg, scrReg, cx2Reg,
                        stack_rtm_counters, method_data, profile_rtm,
                        DONE_LABEL, IsInflated);
    }
#endif // INCLUDE_RTM_OPT

    movptr(tmpReg, Address(objReg, 0));          // [FETCH]
    testptr(tmpReg, markOopDesc::monitor_value); // inflated vs stack-locked|neutral|biased
    jccb(Assembler::notZero, IsInflated);

    // Attempt stack-locking ...
    orptr (tmpReg, markOopDesc::unlocked_value);
    movptr(Address(boxReg, 0), tmpReg);          // Anticipate successful CAS
    if (os::is_MP()) {
      lock();
    }
    cmpxchgptr(boxReg, Address(objReg, 0));      // Updates tmpReg
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
    jcc(Assembler::equal, DONE_LABEL);           // Success

    // Recursive locking.
    // The object is stack-locked: markword contains stack pointer to BasicLock.
    // Locked by current thread if difference with current SP is less than one page.
    subptr(tmpReg, rsp);
    // Next instruction sets ZFlag == 1 (Success) if difference is less than one page.
    andptr(tmpReg, (int32_t) (NOT_LP64(0xFFFFF003) LP64_ONLY(7 - os::vm_page_size())) );
    movptr(Address(boxReg, 0), tmpReg);
    if (counters != NULL) {
      cond_inc32(Assembler::equal,
                 ExternalAddress((address)counters->fast_path_entry_count_addr()));
    }
    jmp(DONE_LABEL);

    bind(IsInflated);
    // The object is inflated. tmpReg contains pointer to ObjectMonitor* + markOopDesc::monitor_value

#if INCLUDE_RTM_OPT
    // Use the same RTM locking code in 32- and 64-bit VM.
    if (use_rtm) {
      rtm_inflated_locking(objReg, boxReg, tmpReg, scrReg, cx1Reg, cx2Reg,
                           rtm_counters, method_data, profile_rtm, DONE_LABEL);
    } else {
#endif // INCLUDE_RTM_OPT

#ifndef _LP64
    // The object is inflated.

    // boxReg refers to the on-stack BasicLock in the current frame.
    // We'd like to write:
    //   set box->_displaced_header = markOopDesc::unused_mark().  Any non-0 value suffices.
    // This is convenient but results in a ST-before-CAS penalty.  The following CAS suffers
    // additional latency as we have another ST in the store buffer that must drain.

    if (EmitSync & 8192) {
       movptr(Address(boxReg, 0), 3);            // results in ST-before-CAS penalty
       get_thread (scrReg);
       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]
       movptr(tmpReg, NULL_WORD);                // consider: xor vs mov
       if (os::is_MP()) {
         lock();
       }
       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    } else
    if ((EmitSync & 128) == 0) {                 // avoid ST-before-CAS
       movptr(scrReg, boxReg);
       movptr(boxReg, tmpReg);                   // consider: LEA box, [tmp-2]

       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
          // prefetchw [eax + Offset(_owner)-2]
          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
       }

       if ((EmitSync & 64) == 0) {
         // Optimistic form: consider XORL tmpReg,tmpReg
         movptr(tmpReg, NULL_WORD);
       } else {
         // Can suffer RTS->RTO upgrades on shared or cold $ lines
         // Test-And-CAS instead of CAS
         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
         testptr(tmpReg, tmpReg);                   // Locked ?
         jccb  (Assembler::notZero, DONE_LABEL);
       }

       // Appears unlocked - try to swing _owner from null to non-null.
       // Ideally, I'd manifest "Self" with get_thread and then attempt
       // to CAS the register containing Self into m->Owner.
       // But we don't have enough registers, so instead we can either try to CAS
       // rsp or the address of the box (in scr) into &m->owner.  If the CAS succeeds
       // we later store "Self" into m->Owner.  Transiently storing a stack address
       // (rsp or the address of the box) into  m->owner is harmless.
       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
       if (os::is_MP()) {
         lock();
       }
       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
       movptr(Address(scrReg, 0), 3);          // box->_displaced_header = 3
       jccb  (Assembler::notZero, DONE_LABEL);
       get_thread (scrReg);                    // beware: clobbers ICCs
       movptr(Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), scrReg);
       xorptr(boxReg, boxReg);                 // set icc.ZFlag = 1 to indicate success

       // If the CAS fails we can either retry or pass control to the slow-path.
       // We use the latter tactic.
       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
       // If the CAS was successful ...
       //   Self has acquired the lock
       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
       // Intentional fall-through into DONE_LABEL ...
    } else {
       movptr(Address(boxReg, 0), intptr_t(markOopDesc::unused_mark()));  // results in ST-before-CAS penalty
       movptr(boxReg, tmpReg);

       // Using a prefetchw helps avoid later RTS->RTO upgrades and cache probes
       if ((EmitSync & 2048) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) {
          // prefetchw [eax + Offset(_owner)-2]
          prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
       }

       if ((EmitSync & 64) == 0) {
         // Optimistic form
         xorptr  (tmpReg, tmpReg);
       } else {
         // Can suffer RTS->RTO upgrades on shared or cold $ lines
         movptr(tmpReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));   // rax, = m->_owner
         testptr(tmpReg, tmpReg);                   // Locked ?
         jccb  (Assembler::notZero, DONE_LABEL);
       }

       // Appears unlocked - try to swing _owner from null to non-null.
       // Use either "Self" (in scr) or rsp as thread identity in _owner.
       // Invariant: tmpReg == 0.  tmpReg is EAX which is the implicit cmpxchg comparand.
       get_thread (scrReg);
       if (os::is_MP()) {
         lock();
       }
       cmpxchgptr(scrReg, Address(boxReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));

       // If the CAS fails we can either retry or pass control to the slow-path.
       // We use the latter tactic.
       // Pass the CAS result in the icc.ZFlag into DONE_LABEL
       // If the CAS was successful ...
       //   Self has acquired the lock
       //   Invariant: m->_recursions should already be 0, so we don't need to explicitly set it.
       // Intentional fall-through into DONE_LABEL ...
    }
#else // _LP64
    // It's inflated
    movq(scrReg, tmpReg);
    xorq(tmpReg, tmpReg);

    if (os::is_MP()) {
      lock();
    }
    cmpxchgptr(r15_thread, Address(scrReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)));
    // Unconditionally set box->_displaced_header = markOopDesc::unused_mark().
    // Without cast to int32_t movptr will destroy r10 which is typically obj.
    movptr(Address(boxReg, 0), (int32_t)intptr_t(markOopDesc::unused_mark()));
    // Intentional fall-through into DONE_LABEL ...
    // Propagate ICC.ZF from CAS above into DONE_LABEL.
#endif // _LP64
#if INCLUDE_RTM_OPT
    } // use_rtm()
#endif
    // DONE_LABEL is a hot target - we'd really like to place it at the
    // start of cache line by padding with NOPs.
    // See the AMD and Intel software optimization manuals for the
    // most efficient "long" NOP encodings.
    // Unfortunately none of our alignment mechanisms suffice.
    bind(DONE_LABEL);

    // At DONE_LABEL the icc ZFlag is set as follows ...
    // Fast_Unlock uses the same protocol.
    // ZFlag == 1 -> Success
    // ZFlag == 0 -> Failure - force control through the slow-path
  }
}

// obj: object to unlock
// box: box address (displaced header location), killed.  Must be EAX.
// tmp: killed, cannot be obj nor box.
//
// Some commentary on balanced locking:
//
// Fast_Lock and Fast_Unlock are emitted only for provably balanced lock sites.
// Methods that don't have provably balanced locking are forced to run in the
// interpreter - such methods won't be compiled to use fast_lock and fast_unlock.
// The interpreter provides two properties:
// I1:  At return-time the interpreter automatically and quietly unlocks any
//      objects acquired in the current activation (frame).  Recall that the
//      interpreter maintains an on-stack list of locks currently held by
//      a frame.
// I2:  If a method attempts to unlock an object that is not held by the
//      frame the interpreter throws IMSX.
//
// Let's say A(), which has provably balanced locking, acquires O and then calls B().
// B() doesn't have provably balanced locking so it runs in the interpreter.
// Control returns to A() and A() unlocks O.  By I1 and I2, above, we know that O
// is still locked by A().
//
// The only other source of unbalanced locking would be JNI.  The "Java Native Interface:
// Programmer's Guide and Specification" claims that an object locked by jni_monitorenter
// should not be unlocked by "normal" java-level locking and vice-versa.  The specification
// doesn't specify what will occur if a program engages in such mixed-mode locking, however.
// Arguably given that the spec legislates the JNI case as undefined our implementation
// could reasonably *avoid* checking owner in Fast_Unlock().
// In the interest of performance we elide m->Owner==Self check in unlock.
// A perfectly viable alternative is to elide the owner check except when
// Xcheck:jni is enabled.
1929 1930 void MacroAssembler::fast_unlock(Register objReg, Register boxReg, Register tmpReg, bool use_rtm) { 1931 assert(boxReg == rax, ""); 1932 assert_different_registers(objReg, boxReg, tmpReg); 1933 1934 if (EmitSync & 4) { 1935 // Disable - inhibit all inlining. Force control through the slow-path 1936 cmpptr (rsp, 0); 1937 } else { 1938 Label DONE_LABEL, Stacked, CheckSucc; 1939 1940 // Critically, the biased locking test must have precedence over 1941 // and appear before the (box->dhw == 0) recursive stack-lock test. 1942 if (UseBiasedLocking && !UseOptoBiasInlining) { 1943 biased_locking_exit(objReg, tmpReg, DONE_LABEL); 1944 } 1945 1946 #if INCLUDE_RTM_OPT 1947 if (UseRTMForStackLocks && use_rtm) { 1948 assert(!UseBiasedLocking, "Biased locking is not supported with RTM locking"); 1949 Label L_regular_unlock; 1950 movptr(tmpReg, Address(objReg, 0)); // fetch markword 1951 andptr(tmpReg, markOopDesc::biased_lock_mask_in_place); // look at 3 lock bits 1952 cmpptr(tmpReg, markOopDesc::unlocked_value); // bits = 001 unlocked 1953 jccb(Assembler::notEqual, L_regular_unlock); // if !HLE RegularLock 1954 xend(); // otherwise end... 1955 jmp(DONE_LABEL); // ... and we're done 1956 bind(L_regular_unlock); 1957 } 1958 #endif 1959 1960 cmpptr(Address(boxReg, 0), (int32_t)NULL_WORD); // Examine the displaced header 1961 jcc (Assembler::zero, DONE_LABEL); // 0 indicates recursive stack-lock 1962 movptr(tmpReg, Address(objReg, 0)); // Examine the object's markword 1963 testptr(tmpReg, markOopDesc::monitor_value); // Inflated? 1964 jccb (Assembler::zero, Stacked); 1965 1966 // It's inflated. 
1967 #if INCLUDE_RTM_OPT 1968 if (use_rtm) { 1969 Label L_regular_inflated_unlock; 1970 int owner_offset = OM_OFFSET_NO_MONITOR_VALUE_TAG(owner); 1971 movptr(boxReg, Address(tmpReg, owner_offset)); 1972 testptr(boxReg, boxReg); 1973 jccb(Assembler::notZero, L_regular_inflated_unlock); 1974 xend(); 1975 jmpb(DONE_LABEL); 1976 bind(L_regular_inflated_unlock); 1977 } 1978 #endif 1979 1980 // Despite our balanced locking property we still check that m->_owner == Self 1981 // as java routines or native JNI code called by this thread might 1982 // have released the lock. 1983 // Refer to the comments in synchronizer.cpp for how we might encode extra 1984 // state in _succ so we can avoid fetching EntryList|cxq. 1985 // 1986 // I'd like to add more cases in fast_lock() and fast_unlock() -- 1987 // such as recursive enter and exit -- but we have to be wary of 1988 // I$ bloat, T$ effects and BP$ effects. 1989 // 1990 // If there's no contention try a 1-0 exit. That is, exit without 1991 // a costly MEMBAR or CAS. See synchronizer.cpp for details on how 1992 // we detect and recover from the race that the 1-0 exit admits. 1993 // 1994 // Conceptually Fast_Unlock() must execute a STST|LDST "release" barrier 1995 // before it STs null into _owner, releasing the lock. Updates 1996 // to data protected by the critical section must be visible before 1997 // we drop the lock (and thus before any other thread could acquire 1998 // the lock and observe the fields protected by the lock). 1999 // IA32's memory-model is SPO, so STs are ordered with respect to 2000 // each other and there's no need for an explicit barrier (fence). 2001 // See also http://gee.cs.oswego.edu/dl/jmm/cookbook.html. 
2002 #ifndef _LP64 2003 get_thread (boxReg); 2004 if ((EmitSync & 4096) && VM_Version::supports_3dnow_prefetch() && os::is_MP()) { 2005 // prefetchw [ebx + Offset(_owner)-2] 2006 prefetchw(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2007 } 2008 2009 // Note that we could employ various encoding schemes to reduce 2010 // the number of loads below (currently 4) to just 2 or 3. 2011 // Refer to the comments in synchronizer.cpp. 2012 // In practice the chain of fetches doesn't seem to impact performance, however. 2013 xorptr(boxReg, boxReg); 2014 if ((EmitSync & 65536) == 0 && (EmitSync & 256)) { 2015 // Attempt to reduce branch density - AMD's branch predictor. 2016 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2017 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2018 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2019 jccb (Assembler::notZero, DONE_LABEL); 2020 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2021 jmpb (DONE_LABEL); 2022 } else { 2023 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2024 jccb (Assembler::notZero, DONE_LABEL); 2025 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2026 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2027 jccb (Assembler::notZero, CheckSucc); 2028 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2029 jmpb (DONE_LABEL); 2030 } 2031 2032 // The Following code fragment (EmitSync & 65536) improves the performance of 2033 // contended applications and contended synchronization microbenchmarks. 2034 // Unfortunately the emission of the code - even though not executed - causes regressions 2035 // in scimark and jetstream, evidently because of $ effects. Replacing the code 2036 // with an equal number of never-executed NOPs results in the same regression. 2037 // We leave it off by default. 
2038 2039 if ((EmitSync & 65536) != 0) { 2040 Label LSuccess, LGoSlowPath ; 2041 2042 bind (CheckSucc); 2043 2044 // Optional pre-test ... it's safe to elide this 2045 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2046 jccb(Assembler::zero, LGoSlowPath); 2047 2048 // We have a classic Dekker-style idiom: 2049 // ST m->_owner = 0 ; MEMBAR; LD m->_succ 2050 // There are a number of ways to implement the barrier: 2051 // (1) lock:andl &m->_owner, 0 2052 // is fast, but mask doesn't currently support the "ANDL M,IMM32" form. 2053 // LOCK: ANDL [ebx+Offset(_Owner)-2], 0 2054 // Encodes as 81 31 OFF32 IMM32 or 83 63 OFF8 IMM8 2055 // (2) If supported, an explicit MFENCE is appealing. 2056 // In older IA32 processors MFENCE is slower than lock:add or xchg 2057 // particularly if the write-buffer is full as might be the case if 2058 // if stores closely precede the fence or fence-equivalent instruction. 2059 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 2060 // as the situation has changed with Nehalem and Shanghai. 2061 // (3) In lieu of an explicit fence, use lock:addl to the top-of-stack 2062 // The $lines underlying the top-of-stack should be in M-state. 2063 // The locked add instruction is serializing, of course. 2064 // (4) Use xchg, which is serializing 2065 // mov boxReg, 0; xchgl boxReg, [tmpReg + Offset(_owner)-2] also works 2066 // (5) ST m->_owner = 0 and then execute lock:orl &m->_succ, 0. 2067 // The integer condition codes will tell us if succ was 0. 2068 // Since _succ and _owner should reside in the same $line and 2069 // we just stored into _owner, it's likely that the $line 2070 // remains in M-state for the lock:orl. 2071 // 2072 // We currently use (3), although it's likely that switching to (2) 2073 // is correct for the future. 
2074 2075 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), NULL_WORD); 2076 if (os::is_MP()) { 2077 lock(); addptr(Address(rsp, 0), 0); 2078 } 2079 // Ratify _succ remains non-null 2080 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), 0); 2081 jccb (Assembler::notZero, LSuccess); 2082 2083 xorptr(boxReg, boxReg); // box is really EAX 2084 if (os::is_MP()) { lock(); } 2085 cmpxchgptr(rsp, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2086 jccb (Assembler::notEqual, LSuccess); 2087 // Since we're low on registers we installed rsp as a placeholding in _owner. 2088 // Now install Self over rsp. This is safe as we're transitioning from 2089 // non-null to non=null 2090 get_thread (boxReg); 2091 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), boxReg); 2092 // Intentional fall-through into LGoSlowPath ... 2093 2094 bind (LGoSlowPath); 2095 orptr(boxReg, 1); // set ICC.ZF=0 to indicate failure 2096 jmpb (DONE_LABEL); 2097 2098 bind (LSuccess); 2099 xorptr(boxReg, boxReg); // set ICC.ZF=1 to indicate success 2100 jmpb (DONE_LABEL); 2101 } 2102 2103 bind (Stacked); 2104 // It's not inflated and it's not recursively stack-locked and it's not biased. 2105 // It must be stack-locked. 2106 // Try to reset the header to displaced header. 2107 // The "box" value on the stack is stable, so we can reload 2108 // and be assured we observe the same value as above. 2109 movptr(tmpReg, Address(boxReg, 0)); 2110 if (os::is_MP()) { 2111 lock(); 2112 } 2113 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 2114 // Intention fall-thru into DONE_LABEL 2115 2116 // DONE_LABEL is a hot target - we'd really like to place it at the 2117 // start of cache line by padding with NOPs. 2118 // See the AMD and Intel software optimization manuals for the 2119 // most efficient "long" NOP encodings. 2120 // Unfortunately none of our alignment mechanisms suffice. 
2121 if ((EmitSync & 65536) == 0) { 2122 bind (CheckSucc); 2123 } 2124 #else // _LP64 2125 // It's inflated 2126 if (EmitSync & 1024) { 2127 // Emit code to check that _owner == Self 2128 // We could fold the _owner test into subsequent code more efficiently 2129 // than using a stand-alone check, but since _owner checking is off by 2130 // default we don't bother. We also might consider predicating the 2131 // _owner==Self check on Xcheck:jni or running on a debug build. 2132 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2133 xorptr(boxReg, r15_thread); 2134 } else { 2135 xorptr(boxReg, boxReg); 2136 } 2137 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(recursions))); 2138 jccb (Assembler::notZero, DONE_LABEL); 2139 movptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(cxq))); 2140 orptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(EntryList))); 2141 jccb (Assembler::notZero, CheckSucc); 2142 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 2143 jmpb (DONE_LABEL); 2144 2145 if ((EmitSync & 65536) == 0) { 2146 // Try to avoid passing control into the slow_path ... 2147 Label LSuccess, LGoSlowPath ; 2148 bind (CheckSucc); 2149 2150 // The following optional optimization can be elided if necessary 2151 // Effectively: if (succ == null) goto SlowPath 2152 // The code reduces the window for a race, however, 2153 // and thus benefits performance. 
2154 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2155 jccb (Assembler::zero, LGoSlowPath); 2156 2157 if ((EmitSync & 16) && os::is_MP()) { 2158 orptr(boxReg, boxReg); 2159 xchgptr(boxReg, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2160 } else { 2161 movptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner)), (int32_t)NULL_WORD); 2162 if (os::is_MP()) { 2163 // Memory barrier/fence 2164 // Dekker pivot point -- fulcrum : ST Owner; MEMBAR; LD Succ 2165 // Instead of MFENCE we use a dummy locked add of 0 to the top-of-stack. 2166 // This is faster on Nehalem and AMD Shanghai/Barcelona. 2167 // See https://blogs.oracle.com/dave/entry/instruction_selection_for_volatile_fences 2168 // We might also restructure (ST Owner=0;barrier;LD _Succ) to 2169 // (mov box,0; xchgq box, &m->Owner; LD _succ) . 2170 lock(); addl(Address(rsp, 0), 0); 2171 } 2172 } 2173 cmpptr(Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(succ)), (int32_t)NULL_WORD); 2174 jccb (Assembler::notZero, LSuccess); 2175 2176 // Rare inopportune interleaving - race. 2177 // The successor vanished in the small window above. 2178 // The lock is contended -- (cxq|EntryList) != null -- and there's no apparent successor. 2179 // We need to ensure progress and succession. 2180 // Try to reacquire the lock. 2181 // If that fails then the new owner is responsible for succession and this 2182 // thread needs to take no further action and can exit via the fast path (success). 2183 // If the re-acquire succeeds then pass control into the slow path. 2184 // As implemented, this latter mode is horrible because we generated more 2185 // coherence traffic on the lock *and* artifically extended the critical section 2186 // length while by virtue of passing control into the slow path. 
2187 2188 // box is really RAX -- the following CMPXCHG depends on that binding 2189 // cmpxchg R,[M] is equivalent to rax = CAS(M,rax,R) 2190 movptr(boxReg, (int32_t)NULL_WORD); 2191 if (os::is_MP()) { lock(); } 2192 cmpxchgptr(r15_thread, Address(tmpReg, OM_OFFSET_NO_MONITOR_VALUE_TAG(owner))); 2193 jccb (Assembler::notEqual, LSuccess); 2194 // Intentional fall-through into slow-path 2195 2196 bind (LGoSlowPath); 2197 orl (boxReg, 1); // set ICC.ZF=0 to indicate failure 2198 jmpb (DONE_LABEL); 2199 2200 bind (LSuccess); 2201 testl (boxReg, 0); // set ICC.ZF=1 to indicate success 2202 jmpb (DONE_LABEL); 2203 } 2204 2205 bind (Stacked); 2206 movptr(tmpReg, Address (boxReg, 0)); // re-fetch 2207 if (os::is_MP()) { lock(); } 2208 cmpxchgptr(tmpReg, Address(objReg, 0)); // Uses RAX which is box 2209 2210 if (EmitSync & 65536) { 2211 bind (CheckSucc); 2212 } 2213 #endif 2214 bind(DONE_LABEL); 2215 } 2216 } 2217 #endif // COMPILER2 2218 2219 void MacroAssembler::c2bool(Register x) { 2220 // implements x == 0 ? 0 : 1 2221 // note: must only look at least-significant byte of x 2222 // since C-style booleans are stored in one byte 2223 // only! 
(was bug) 2224 andl(x, 0xFF); 2225 setb(Assembler::notZero, x); 2226 } 2227 2228 // Wouldn't need if AddressLiteral version had new name 2229 void MacroAssembler::call(Label& L, relocInfo::relocType rtype) { 2230 Assembler::call(L, rtype); 2231 } 2232 2233 void MacroAssembler::call(Register entry) { 2234 Assembler::call(entry); 2235 } 2236 2237 void MacroAssembler::call(AddressLiteral entry) { 2238 if (reachable(entry)) { 2239 Assembler::call_literal(entry.target(), entry.rspec()); 2240 } else { 2241 lea(rscratch1, entry); 2242 Assembler::call(rscratch1); 2243 } 2244 } 2245 2246 void MacroAssembler::ic_call(address entry) { 2247 RelocationHolder rh = virtual_call_Relocation::spec(pc()); 2248 movptr(rax, (intptr_t)Universe::non_oop_word()); 2249 call(AddressLiteral(entry, rh)); 2250 } 2251 2252 // Implementation of call_VM versions 2253 2254 void MacroAssembler::call_VM(Register oop_result, 2255 address entry_point, 2256 bool check_exceptions) { 2257 Label C, E; 2258 call(C, relocInfo::none); 2259 jmp(E); 2260 2261 bind(C); 2262 call_VM_helper(oop_result, entry_point, 0, check_exceptions); 2263 ret(0); 2264 2265 bind(E); 2266 } 2267 2268 void MacroAssembler::call_VM(Register oop_result, 2269 address entry_point, 2270 Register arg_1, 2271 bool check_exceptions) { 2272 Label C, E; 2273 call(C, relocInfo::none); 2274 jmp(E); 2275 2276 bind(C); 2277 pass_arg1(this, arg_1); 2278 call_VM_helper(oop_result, entry_point, 1, check_exceptions); 2279 ret(0); 2280 2281 bind(E); 2282 } 2283 2284 void MacroAssembler::call_VM(Register oop_result, 2285 address entry_point, 2286 Register arg_1, 2287 Register arg_2, 2288 bool check_exceptions) { 2289 Label C, E; 2290 call(C, relocInfo::none); 2291 jmp(E); 2292 2293 bind(C); 2294 2295 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2296 2297 pass_arg2(this, arg_2); 2298 pass_arg1(this, arg_1); 2299 call_VM_helper(oop_result, entry_point, 2, check_exceptions); 2300 ret(0); 2301 2302 bind(E); 2303 } 2304 2305 void 
MacroAssembler::call_VM(Register oop_result, 2306 address entry_point, 2307 Register arg_1, 2308 Register arg_2, 2309 Register arg_3, 2310 bool check_exceptions) { 2311 Label C, E; 2312 call(C, relocInfo::none); 2313 jmp(E); 2314 2315 bind(C); 2316 2317 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 2318 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 2319 pass_arg3(this, arg_3); 2320 2321 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2322 pass_arg2(this, arg_2); 2323 2324 pass_arg1(this, arg_1); 2325 call_VM_helper(oop_result, entry_point, 3, check_exceptions); 2326 ret(0); 2327 2328 bind(E); 2329 } 2330 2331 void MacroAssembler::call_VM(Register oop_result, 2332 Register last_java_sp, 2333 address entry_point, 2334 int number_of_arguments, 2335 bool check_exceptions) { 2336 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); 2337 call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 2338 } 2339 2340 void MacroAssembler::call_VM(Register oop_result, 2341 Register last_java_sp, 2342 address entry_point, 2343 Register arg_1, 2344 bool check_exceptions) { 2345 pass_arg1(this, arg_1); 2346 call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 2347 } 2348 2349 void MacroAssembler::call_VM(Register oop_result, 2350 Register last_java_sp, 2351 address entry_point, 2352 Register arg_1, 2353 Register arg_2, 2354 bool check_exceptions) { 2355 2356 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2357 pass_arg2(this, arg_2); 2358 pass_arg1(this, arg_1); 2359 call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 2360 } 2361 2362 void MacroAssembler::call_VM(Register oop_result, 2363 Register last_java_sp, 2364 address entry_point, 2365 Register arg_1, 2366 Register arg_2, 2367 Register arg_3, 2368 bool check_exceptions) { 2369 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 2370 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 2371 pass_arg3(this, arg_3); 2372 
LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2373 pass_arg2(this, arg_2); 2374 pass_arg1(this, arg_1); 2375 call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 2376 } 2377 2378 void MacroAssembler::super_call_VM(Register oop_result, 2379 Register last_java_sp, 2380 address entry_point, 2381 int number_of_arguments, 2382 bool check_exceptions) { 2383 Register thread = LP64_ONLY(r15_thread) NOT_LP64(noreg); 2384 MacroAssembler::call_VM_base(oop_result, thread, last_java_sp, entry_point, number_of_arguments, check_exceptions); 2385 } 2386 2387 void MacroAssembler::super_call_VM(Register oop_result, 2388 Register last_java_sp, 2389 address entry_point, 2390 Register arg_1, 2391 bool check_exceptions) { 2392 pass_arg1(this, arg_1); 2393 super_call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); 2394 } 2395 2396 void MacroAssembler::super_call_VM(Register oop_result, 2397 Register last_java_sp, 2398 address entry_point, 2399 Register arg_1, 2400 Register arg_2, 2401 bool check_exceptions) { 2402 2403 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2404 pass_arg2(this, arg_2); 2405 pass_arg1(this, arg_1); 2406 super_call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); 2407 } 2408 2409 void MacroAssembler::super_call_VM(Register oop_result, 2410 Register last_java_sp, 2411 address entry_point, 2412 Register arg_1, 2413 Register arg_2, 2414 Register arg_3, 2415 bool check_exceptions) { 2416 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 2417 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 2418 pass_arg3(this, arg_3); 2419 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2420 pass_arg2(this, arg_2); 2421 pass_arg1(this, arg_1); 2422 super_call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); 2423 } 2424 2425 void MacroAssembler::call_VM_base(Register oop_result, 2426 Register java_thread, 2427 Register last_java_sp, 2428 address entry_point, 2429 int number_of_arguments, 2430 bool 
check_exceptions) { 2431 // determine java_thread register 2432 if (!java_thread->is_valid()) { 2433 #ifdef _LP64 2434 java_thread = r15_thread; 2435 #else 2436 java_thread = rdi; 2437 get_thread(java_thread); 2438 #endif // LP64 2439 } 2440 // determine last_java_sp register 2441 if (!last_java_sp->is_valid()) { 2442 last_java_sp = rsp; 2443 } 2444 // debugging support 2445 assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); 2446 LP64_ONLY(assert(java_thread == r15_thread, "unexpected register")); 2447 #ifdef ASSERT 2448 // TraceBytecodes does not use r12 but saves it over the call, so don't verify 2449 // r12 is the heapbase. 2450 LP64_ONLY(if ((UseCompressedOops || UseCompressedClassPointers) && !TraceBytecodes) verify_heapbase("call_VM_base: heap base corrupted?");) 2451 #endif // ASSERT 2452 2453 assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); 2454 assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); 2455 2456 // push java thread (becomes first argument of C function) 2457 2458 NOT_LP64(push(java_thread); number_of_arguments++); 2459 LP64_ONLY(mov(c_rarg0, r15_thread)); 2460 2461 // set last Java frame before call 2462 assert(last_java_sp != rbp, "can't use ebp/rbp"); 2463 2464 // Only interpreter should have to set fp 2465 set_last_Java_frame(java_thread, last_java_sp, rbp, NULL); 2466 2467 // do the call, remove parameters 2468 MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments); 2469 2470 // restore the thread (cannot use the pushed argument since arguments 2471 // may be overwritten by C code generated by an optimizing compiler); 2472 // however can use the register value directly if it is callee saved. 
2473 if (LP64_ONLY(true ||) java_thread == rdi || java_thread == rsi) { 2474 // rdi & rsi (also r15) are callee saved -> nothing to do 2475 #ifdef ASSERT 2476 guarantee(java_thread != rax, "change this code"); 2477 push(rax); 2478 { Label L; 2479 get_thread(rax); 2480 cmpptr(java_thread, rax); 2481 jcc(Assembler::equal, L); 2482 STOP("MacroAssembler::call_VM_base: rdi not callee saved?"); 2483 bind(L); 2484 } 2485 pop(rax); 2486 #endif 2487 } else { 2488 get_thread(java_thread); 2489 } 2490 // reset last Java frame 2491 // Only interpreter should have to clear fp 2492 reset_last_Java_frame(java_thread, true, false); 2493 2494 #ifndef CC_INTERP 2495 // C++ interp handles this in the interpreter 2496 check_and_handle_popframe(java_thread); 2497 check_and_handle_earlyret(java_thread); 2498 #endif /* CC_INTERP */ 2499 2500 if (check_exceptions) { 2501 // check for pending exceptions (java_thread is set upon return) 2502 cmpptr(Address(java_thread, Thread::pending_exception_offset()), (int32_t) NULL_WORD); 2503 #ifndef _LP64 2504 jump_cc(Assembler::notEqual, 2505 RuntimeAddress(StubRoutines::forward_exception_entry())); 2506 #else 2507 // This used to conditionally jump to forward_exception however it is 2508 // possible if we relocate that the branch will not reach. So we must jump 2509 // around so we can always reach 2510 2511 Label ok; 2512 jcc(Assembler::equal, ok); 2513 jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 2514 bind(ok); 2515 #endif // LP64 2516 } 2517 2518 // get oop result if there is one and reset the value in the thread 2519 if (oop_result->is_valid()) { 2520 get_vm_result(oop_result, java_thread); 2521 } 2522 } 2523 2524 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) { 2525 2526 // Calculate the value for last_Java_sp 2527 // somewhat subtle. 
call_VM does an intermediate call 2528 // which places a return address on the stack just under the 2529 // stack pointer as the user finsihed with it. This allows 2530 // use to retrieve last_Java_pc from last_Java_sp[-1]. 2531 // On 32bit we then have to push additional args on the stack to accomplish 2532 // the actual requested call. On 64bit call_VM only can use register args 2533 // so the only extra space is the return address that call_VM created. 2534 // This hopefully explains the calculations here. 2535 2536 #ifdef _LP64 2537 // We've pushed one address, correct last_Java_sp 2538 lea(rax, Address(rsp, wordSize)); 2539 #else 2540 lea(rax, Address(rsp, (1 + number_of_arguments) * wordSize)); 2541 #endif // LP64 2542 2543 call_VM_base(oop_result, noreg, rax, entry_point, number_of_arguments, check_exceptions); 2544 2545 } 2546 2547 void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { 2548 call_VM_leaf_base(entry_point, number_of_arguments); 2549 } 2550 2551 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { 2552 pass_arg0(this, arg_0); 2553 call_VM_leaf(entry_point, 1); 2554 } 2555 2556 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 2557 2558 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 2559 pass_arg1(this, arg_1); 2560 pass_arg0(this, arg_0); 2561 call_VM_leaf(entry_point, 2); 2562 } 2563 2564 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 2565 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 2566 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2567 pass_arg2(this, arg_2); 2568 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 2569 pass_arg1(this, arg_1); 2570 pass_arg0(this, arg_0); 2571 call_VM_leaf(entry_point, 3); 2572 } 2573 2574 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { 2575 pass_arg0(this, arg_0); 2576 MacroAssembler::call_VM_leaf_base(entry_point, 
1); 2577 } 2578 2579 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { 2580 2581 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 2582 pass_arg1(this, arg_1); 2583 pass_arg0(this, arg_0); 2584 MacroAssembler::call_VM_leaf_base(entry_point, 2); 2585 } 2586 2587 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { 2588 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 2589 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2590 pass_arg2(this, arg_2); 2591 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 2592 pass_arg1(this, arg_1); 2593 pass_arg0(this, arg_0); 2594 MacroAssembler::call_VM_leaf_base(entry_point, 3); 2595 } 2596 2597 void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { 2598 LP64_ONLY(assert(arg_0 != c_rarg3, "smashed arg")); 2599 LP64_ONLY(assert(arg_1 != c_rarg3, "smashed arg")); 2600 LP64_ONLY(assert(arg_2 != c_rarg3, "smashed arg")); 2601 pass_arg3(this, arg_3); 2602 LP64_ONLY(assert(arg_0 != c_rarg2, "smashed arg")); 2603 LP64_ONLY(assert(arg_1 != c_rarg2, "smashed arg")); 2604 pass_arg2(this, arg_2); 2605 LP64_ONLY(assert(arg_0 != c_rarg1, "smashed arg")); 2606 pass_arg1(this, arg_1); 2607 pass_arg0(this, arg_0); 2608 MacroAssembler::call_VM_leaf_base(entry_point, 4); 2609 } 2610 2611 void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { 2612 movptr(oop_result, Address(java_thread, JavaThread::vm_result_offset())); 2613 movptr(Address(java_thread, JavaThread::vm_result_offset()), NULL_WORD); 2614 verify_oop(oop_result, "broken oop in call_VM_base"); 2615 } 2616 2617 void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) { 2618 movptr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset())); 2619 movptr(Address(java_thread, JavaThread::vm_result_2_offset()), NULL_WORD); 2620 } 2621 2622 void 
MacroAssembler::check_and_handle_earlyret(Register java_thread) {
  // Empty implementation on x86.
}

void MacroAssembler::check_and_handle_popframe(Register java_thread) {
  // Empty implementation on x86.
}

// 32-bit compare of memory at src1 against an immediate; falls back to
// rscratch1 addressing when src1 is not directly reachable.
void MacroAssembler::cmp32(AddressLiteral src1, int32_t imm) {
  if (reachable(src1)) {
    cmpl(as_Address(src1), imm);
  } else {
    lea(rscratch1, src1);
    cmpl(Address(rscratch1, 0), imm);
  }
}

void MacroAssembler::cmp32(Register src1, AddressLiteral src2) {
  assert(!src2.is_lval(), "use cmpptr");
  if (reachable(src2)) {
    cmpl(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    cmpl(src1, Address(rscratch1, 0));
  }
}

void MacroAssembler::cmp32(Register src1, int32_t imm) {
  Assembler::cmpl(src1, imm);
}

void MacroAssembler::cmp32(Register src1, Address src2) {
  Assembler::cmpl(src1, src2);
}

// Compare two doubles and materialize -1/0/+1 in dst. After ucomisd the
// parity flag is set for an unordered (NaN) result; unordered_is_less
// selects whether NaN compares as less (-1) or greater (+1).
void MacroAssembler::cmpsd2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomisd(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}

// Single-precision variant of cmpsd2int (see above).
void MacroAssembler::cmpss2int(XMMRegister opr1, XMMRegister opr2, Register dst, bool unordered_is_less) {
  ucomiss(opr1, opr2);

  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}


void MacroAssembler::cmp8(AddressLiteral src1, int imm) {
  if (reachable(src1)) {
    cmpb(as_Address(src1), imm);
  } else {
    lea(rscratch1, src1);
    cmpb(Address(rscratch1, 0), imm);
  }
}

// Pointer-sized compare of a register against an AddressLiteral. An lval
// literal means "compare against the literal address itself", not the value
// stored at that address.
void MacroAssembler::cmpptr(Register src1, AddressLiteral src2) {
#ifdef _LP64
  if (src2.is_lval()) {
    movptr(rscratch1, src2);
    Assembler::cmpq(src1, rscratch1);
  } else if (reachable(src2)) {
    cmpq(src1, as_Address(src2));
  } else {
    lea(rscratch1, src2);
    Assembler::cmpq(src1, Address(rscratch1, 0));
  }
#else
  if (src2.is_lval()) {
    cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
  } else {
    cmpl(src1, as_Address(src2));
  }
#endif // _LP64
}

void MacroAssembler::cmpptr(Address src1, AddressLiteral src2) {
  assert(src2.is_lval(), "not a mem-mem compare");
#ifdef _LP64
  // moves src2's literal address
  movptr(rscratch1, src2);
  Assembler::cmpq(src1, rscratch1);
#else
  cmp_literal32(src1, (int32_t) src2.target(), src2.rspec());
#endif // _LP64
}

// cmpxchg of reg against memory at adr with a lock prefix on MP systems.
void MacroAssembler::locked_cmpxchgptr(Register reg, AddressLiteral adr) {
  if (reachable(adr)) {
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, as_Address(adr));
  } else {
    lea(rscratch1, adr);
    if (os::is_MP())
      lock();
    cmpxchgptr(reg, Address(rscratch1, 0));
  }
}

void MacroAssembler::cmpxchgptr(Register reg, Address adr) {
  LP64_ONLY(cmpxchgq(reg, adr)) NOT_LP64(cmpxchgl(reg, adr));
}

void MacroAssembler::comisd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::comisd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::comisd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::comiss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::comiss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::comiss(dst, Address(rscratch1, 0));
  }
}


// Conditionally increment the 32-bit counter at counter_addr; flags are
// preserved around the atomic increment.
void MacroAssembler::cond_inc32(Condition cond, AddressLiteral counter_addr) {
  Condition negated_cond = negate_condition(cond);
  Label L;
  jcc(negated_cond, L);
  pushf(); // Preserve flags
  atomic_incl(counter_addr);
  popf();
  bind(L);
}

int MacroAssembler::corrected_idivl(Register reg) {
  // Full implementation of Java idiv and irem; checks for
  // special case as described in JVM spec., p.243 & p.271.
  // The function returns the (pc) offset of the idivl
  // instruction - may be needed for implicit exceptions.
  //
  //         normal case                           special case
  //
  // input : rax,: dividend                         min_int
  //         reg: divisor   (may not be rax,/rdx)   -1
  //
  // output: rax,: quotient  (= rax, idiv reg)      min_int
  //         rdx: remainder (= rax, irem reg)       0
  assert(reg != rax && reg != rdx, "reg cannot be rax, or rdx register");
  const int min_int = 0x80000000;
  Label normal_case, special_case;

  // check for special case
  cmpl(rax, min_int);
  jcc(Assembler::notEqual, normal_case);
  xorl(rdx, rdx); // prepare rdx for possible special case (where remainder = 0)
  cmpl(reg, -1);
  jcc(Assembler::equal, special_case);

  // handle normal case
  bind(normal_case);
  cdql();
  int idivl_offset = offset();
  idivl(reg);

  // normal and special case exit
  bind(special_case);

  return idivl_offset;
}



// Subtract value from reg, choosing the shortest encoding; min_jint is
// handled first because -min_jint overflows.
void MacroAssembler::decrementl(Register reg, int value) {
  if (value == min_jint) {subl(reg, value) ; return; }
  if (value < 0) { incrementl(reg, -value); return; }
  if (value == 0) { ; return; }
  if (value == 1 && UseIncDec) { decl(reg) ; return; }
  /* else */ { subl(reg, value) ; return; }
}

void
MacroAssembler::decrementl(Address dst, int value) {
  // Memory-operand twin of decrementl(Register, int); same encoding choices.
  if (value == min_jint) {subl(dst, value) ; return; }
  if (value < 0) { incrementl(dst, -value); return; }
  if (value == 0) { ; return; }
  if (value == 1 && UseIncDec) { decl(dst) ; return; }
  /* else */ { subl(dst, value) ; return; }
}

// Signed division by 2^shift_value: a negative dividend is biased by
// (2^shift - 1) before the arithmetic shift so the result rounds toward zero.
void MacroAssembler::division_with_shift (Register reg, int shift_value) {
  assert (shift_value > 0, "illegal shift value");
  Label _is_positive;
  testl (reg, reg);
  jcc (Assembler::positive, _is_positive);
  int offset = (1 << shift_value) - 1 ;

  if (offset == 1) {
    incrementl(reg);
  } else {
    addl(reg, offset);
  }

  bind (_is_positive);
  sarl(reg, shift_value);
}

void MacroAssembler::divsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::divsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::divsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::divss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::divss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::divss(dst, Address(rscratch1, 0));
  }
}

// !defined(COMPILER2) is because of stupid core builds
#if !defined(_LP64) || defined(COMPILER1) || !defined(COMPILER2)
// Clear the x87 stack; emms does it in one instruction, otherwise free each
// of the 8 registers individually.
void MacroAssembler::empty_FPU_stack() {
  if (VM_Version::supports_mmx()) {
    emms();
  } else {
    for (int i = 8; i-- > 0; ) ffree(i);
  }
}
#endif // !LP64 || C1 || !C2


// Defines obj, preserves var_size_in_bytes
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  assert(obj == rax, "obj must be in rax, for cmpxchg");
  assert_different_registers(obj, var_size_in_bytes, t1);
  if (!Universe::heap()->supports_inline_contig_alloc()) {
    jmp(slow_case);
  } else {
    Register end = t1;
    Label retry;
    bind(retry);
    ExternalAddress heap_top((address) Universe::heap()->top_addr());
    movptr(obj, heap_top);
    // Compute the new top: either a compile-time constant size or a
    // variable size supplied in a register.
    if (var_size_in_bytes == noreg) {
      lea(end, Address(obj, con_size_in_bytes));
    } else {
      lea(end, Address(obj, var_size_in_bytes, Address::times_1));
    }
    // if end < obj then we wrapped around => object too long => slow case
    cmpptr(end, obj);
    jcc(Assembler::below, slow_case);
    cmpptr(end, ExternalAddress((address) Universe::heap()->end_addr()));
    jcc(Assembler::above, slow_case);
    // Compare obj with the top addr, and if still equal, store the new top addr in
    // end at the address of the top addr pointer. Sets ZF if was equal, and clears
    // it otherwise. Use lock prefix for atomicity on MPs.
    locked_cmpxchgptr(end, heap_top);
    jcc(Assembler::notEqual, retry);
  }
}

// Standard frame prologue: push caller's rbp and establish the new frame ptr.
void MacroAssembler::enter() {
  push(rbp);
  mov(rbp, rsp);
}

// A 5 byte nop that is safe for patching (see patch_verified_entry)
void MacroAssembler::fat_nop() {
  if (UseAddressNop) {
    addr_nop_5();
  } else {
    // Four segment-override prefixes followed by a plain nop.
    emit_int8(0x26); // es:
    emit_int8(0x2e); // cs:
    emit_int8(0x64); // fs:
    emit_int8(0x65); // gs:
    emit_int8((unsigned char)0x90);
  }
}

void MacroAssembler::fcmp(Register tmp) {
  fcmp(tmp, 1, true, true);
}

// x87 compare of ST0 against ST(index), optionally popping the operands.
// On CPUs with cmov/fucomi the flags are set directly; otherwise the FPU
// status word is routed through rax (tmp preserves rax's old value).
void MacroAssembler::fcmp(Register tmp, int index, bool pop_left, bool pop_right) {
  assert(!pop_right || pop_left, "usage error");
  if (VM_Version::supports_cmov()) {
    assert(tmp == noreg, "unneeded temp");
    if (pop_left) {
      fucomip(index);
    } else {
      fucomi(index);
    }
    if (pop_right) {
      fpop();
    }
  } else {
    assert(tmp != noreg, "need temp");
    if (pop_left) {
      if (pop_right) {
        fcompp();
      } else {
        fcomp(index);
      }
    } else {
      fcom(index);
    }
    // convert FPU condition into eflags condition via rax,
    save_rax(tmp);
    fwait(); fnstsw_ax();
    sahf();
    restore_rax(tmp);
  }
  // condition codes set as follows:
  //
  // CF (corresponds to C0) if x < y
  // PF (corresponds to C2) if unordered
  // ZF (corresponds to C3) if x = y
}

void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less) {
  fcmp2int(dst, unordered_is_less, 1, true, true);
}

// x87 analogue of cmpsd2int: compare via fcmp and materialize -1/0/+1 in
// dst; parity (unordered/NaN) maps to -1 or +1 per unordered_is_less.
void MacroAssembler::fcmp2int(Register dst, bool unordered_is_less, int index, bool pop_left, bool pop_right) {
  fcmp(VM_Version::supports_cmov() ? noreg : dst, index, pop_left, pop_right);
  Label L;
  if (unordered_is_less) {
    movl(dst, -1);
    jcc(Assembler::parity, L);
    jcc(Assembler::below , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    increment(dst);
  } else { // unordered is greater
    movl(dst, 1);
    jcc(Assembler::parity, L);
    jcc(Assembler::above , L);
    movl(dst, 0);
    jcc(Assembler::equal , L);
    decrementl(dst);
  }
  bind(L);
}

void MacroAssembler::fld_d(AddressLiteral src) {
  fld_d(as_Address(src));
}

void MacroAssembler::fld_s(AddressLiteral src) {
  fld_s(as_Address(src));
}

void MacroAssembler::fld_x(AddressLiteral src) {
  Assembler::fld_x(as_Address(src));
}

void MacroAssembler::fldcw(AddressLiteral src) {
  Assembler::fldcw(as_Address(src));
}

void MacroAssembler::pow_exp_core_encoding() {
  // kills rax, rcx, rdx
  subptr(rsp,sizeof(jdouble));
  // computes 2^X. Stack: X ...
  // f2xm1 computes 2^X-1 but only operates on -1<=X<=1. Get int(X) and
  // keep it on the thread's stack to compute 2^int(X) later
  // then compute 2^(X-int(X)) as (2^(X-int(X)-1+1)
  // final result is obtained with: 2^X = 2^int(X) * 2^(X-int(X))
  fld_s(0);                 // Stack: X X ...
  frndint();                // Stack: int(X) X ...
  fsuba(1);                 // Stack: int(X) X-int(X) ...
  fistp_s(Address(rsp,0));  // move int(X) as integer to thread's stack. Stack: X-int(X) ...
  f2xm1();                  // Stack: 2^(X-int(X))-1 ...
  fld1();                   // Stack: 1 2^(X-int(X))-1 ...
  faddp(1);                 // Stack: 2^(X-int(X))
  // computes 2^(int(X)): add exponent bias (1023) to int(X), then
  // shift int(X)+1023 to exponent position.
  // Exponent is limited to 11 bits if int(X)+1023 does not fit in 11
  // bits, set result to NaN. 0x000 and 0x7FF are reserved exponent
  // values so detect them and set result to NaN.
  movl(rax,Address(rsp,0));
  movl(rcx, -2048); // 11 bit mask and valid NaN binary encoding
  addl(rax, 1023);
  movl(rdx,rax);
  shll(rax,20);
  // Check that 0 < int(X)+1023 < 2047. Otherwise set rax to NaN.
  addl(rdx,1);
  // Check that 1 < int(X)+1023+1 < 2048
  // in 3 steps:
  // 1- (int(X)+1023+1)&-2048 == 0 => 0 <= int(X)+1023+1 < 2048
  // 2- (int(X)+1023+1)&-2048 != 0
  // 3- (int(X)+1023+1)&-2048 != 1
  // Do 2- first because addl just updated the flags.
  cmov32(Assembler::equal,rax,rcx);
  cmpl(rdx,1);
  cmov32(Assembler::equal,rax,rcx);
  testl(rdx,rcx);
  cmov32(Assembler::notEqual,rax,rcx);
  // Assemble the 2^int(X) double on the stack (high word = exponent bits,
  // low word = 0) and multiply it in.
  movl(Address(rsp,4),rax);
  movl(Address(rsp,0),0);
  fmul_d(Address(rsp,0));   // Stack: 2^X ...
  addptr(rsp,sizeof(jdouble));
}

// Raise the x87 precision control to 64-bit (extended). The original control
// word is left on the stack for restore_precision() to reload.
void MacroAssembler::increase_precision() {
  subptr(rsp, BytesPerWord);
  fnstcw(Address(rsp, 0));
  movl(rax, Address(rsp, 0));
  orl(rax, 0x300);
  push(rax);
  fldcw(Address(rsp, 0));
  pop(rax);
}

// Reload the control word saved by increase_precision() and free its slot.
void MacroAssembler::restore_precision() {
  fldcw(Address(rsp, 0));
  addptr(rsp, BytesPerWord);
}

void MacroAssembler::fast_pow() {
  // computes X^Y = 2^(Y * log2(X))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  BLOCK_COMMENT("fast_pow {");
  increase_precision();
  fyl2x();                 // Stack: (Y*log2(X)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
  BLOCK_COMMENT("} fast_pow");
}

void MacroAssembler::fast_exp() {
  // computes exp(X) = 2^(X * log2(e))
  // if fast computation is not possible, result is NaN. Requires
  // fallback from user of this macro.
  // increase precision for intermediate steps of the computation
  increase_precision();
  fldl2e();                // Stack: log2(e) X ...
  fmulp(1);                // Stack: (X*log2(e)) ...
  pow_exp_core_encoding(); // Stack: exp(X) ...
  restore_precision();
}

void MacroAssembler::pow_or_exp(bool is_exp, int num_fpu_regs_in_use) {
  // kills rax, rcx, rdx
  // pow and exp needs 2 extra registers on the fpu stack.
  Label slow_case, done;
  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rdx,
    tmp = rdx;
  }
  Register tmp2 = rax;
  Register tmp3 = rcx;

  if (is_exp) {
    // Stack: X
    fld_s(0);                   // duplicate argument for runtime call. Stack: X X
    fast_exp();                 // Stack: exp(X) X
    fcmp(tmp, 0, false, false); // Stack: exp(X) X
    // exp(X) not equal to itself: exp(X) is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate argument. Stack: exp(X)
    if (num_fpu_regs_in_use > 0) {
      fxch();
      fpop();
    } else {
      ffree(1);
    }
    jmp(done);
  } else {
    // Stack: X Y
    Label x_negative, y_not_2;

    static double two = 2.0;
    ExternalAddress two_addr((address)&two);

    // constant may be too far on 64 bit
    lea(tmp2, two_addr);
    fld_d(Address(tmp2, 0));    // Stack: 2 X Y
    fcmp(tmp, 2, true, false);  // Stack: X Y
    jcc(Assembler::parity, y_not_2);
    jcc(Assembler::notEqual, y_not_2);

    // Y == 2: just square X.
    fxch(); fpop();             // Stack: X
    fmul(0);                    // Stack: X*X

    jmp(done);

    bind(y_not_2);

    fldz();                     // Stack: 0 X Y
    fcmp(tmp, 1, true, false);  // Stack: X Y
    jcc(Assembler::above, x_negative);

    // X >= 0

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fast_pow();                 // Stack: X^Y X Y
    fcmp(tmp, 0, false, false); // Stack: X^Y X Y
    // X^Y not equal to itself: X^Y is NaN go to slow case.
    jcc(Assembler::parity, slow_case);
    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }
    jmp(done);

    // X <= 0
    bind(x_negative);

    fld_s(1);                   // Stack: Y X Y
    frndint();                  // Stack: int(Y) X Y
    fcmp(tmp, 2, false, false); // Stack: int(Y) X Y
    jcc(Assembler::notEqual, slow_case);

    subptr(rsp, 8);

    // For X^Y, when X < 0, Y has to be an integer and the final
    // result depends on whether it's odd or even. We just checked
    // that int(Y) == Y. We move int(Y) to gp registers as a 64 bit
    // integer to test its parity. If int(Y) is huge and doesn't fit
    // in the 64 bit integer range, the integer indefinite value will
    // end up in the gp registers. Huge numbers are all even, the
    // integer indefinite number is even so it's fine.

#ifdef ASSERT
    // Let's check we don't end up with an integer indefinite number
    // when not expected. First test for huge numbers: check whether
    // int(Y)+1 == int(Y) which is true for very large numbers and
    // those are all even. A 64 bit integer is guaranteed to not
    // overflow for numbers where y+1 != y (when precision is set to
    // double precision).
    Label y_not_huge;

    fld1();                     // Stack: 1 int(Y) X Y
    fadd(1);                    // Stack: 1+int(Y) int(Y) X Y

#ifdef _LP64
    // trip to memory to force the precision down from double extended
    // precision
    fstp_d(Address(rsp, 0));
    fld_d(Address(rsp, 0));
#endif

    fcmp(tmp, 1, true, false);  // Stack: int(Y) X Y
#endif

    // move int(Y) as 64 bit integer to thread's stack
    fistp_d(Address(rsp,0));    // Stack: X Y

#ifdef ASSERT
    jcc(Assembler::notEqual, y_not_huge);

    // Y is huge so we know it's even. It may not fit in a 64 bit
    // integer and we don't want the debug code below to see the
    // integer indefinite value so overwrite int(Y) on the thread's
    // stack with 0.
    movl(Address(rsp, 0), 0);
    movl(Address(rsp, 4), 0);

    bind(y_not_huge);
#endif

    fld_s(1);                   // duplicate arguments for runtime call. Stack: Y X Y
    fld_s(1);                   // Stack: X Y X Y
    fabs();                     // Stack: abs(X) Y X Y
    fast_pow();                 // Stack: abs(X)^Y X Y
    fcmp(tmp, 0, false, false); // Stack: abs(X)^Y X Y
    // abs(X)^Y not equal to itself: abs(X)^Y is NaN go to slow case.

    pop(tmp2);
    NOT_LP64(pop(tmp3));
    jcc(Assembler::parity, slow_case);

#ifdef ASSERT
    // Check that int(Y) is not integer indefinite value (int
    // overflow). Shouldn't happen because for values that would
    // overflow, 1+int(Y)==Y which was tested earlier.
#ifndef _LP64
    {
      Label integer;
      testl(tmp2, tmp2);
      jcc(Assembler::notZero, integer);
      cmpl(tmp3, 0x80000000);
      jcc(Assembler::notZero, integer);
      STOP("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#else
    {
      Label integer;
      mov(tmp3, tmp2); // preserve tmp2 for parity check below
      shlq(tmp3, 1);
      jcc(Assembler::carryClear, integer);
      jcc(Assembler::notZero, integer);
      STOP("integer indefinite value shouldn't be seen here");
      bind(integer);
    }
#endif
#endif

    // get rid of duplicate arguments. Stack: X^Y
    if (num_fpu_regs_in_use > 0) {
      fxch(); fpop();
      fxch(); fpop();
    } else {
      ffree(2);
      ffree(1);
    }

    testl(tmp2, 1);
    jcc(Assembler::zero, done); // X <= 0, Y even: X^Y = abs(X)^Y
    // X <= 0, Y odd: X^Y = -abs(X)^Y

    fchs();                     // Stack: -abs(X)^Y Y
    jmp(done);
  }

  // slow case: runtime call
  bind(slow_case);

  fpop();                      // pop incorrect result or int(Y)

  fp_runtime_fallback(is_exp ? CAST_FROM_FN_PTR(address, SharedRuntime::dexp) : CAST_FROM_FN_PTR(address, SharedRuntime::dpow),
                      is_exp ? 1 : 2, num_fpu_regs_in_use);

  // Come here with result in F-TOS
  bind(done);
}

// Pop the x87 stack top: free ST0 and advance the stack-top pointer.
void MacroAssembler::fpop() {
  ffree();
  fincstp();
}

// Partial remainder: loop fprem until the C2 ("reduction incomplete") bit of
// the FPU status word clears. rax is clobbered for the status read and is
// preserved via tmp (or the stack when tmp == noreg).
void MacroAssembler::fremr(Register tmp) {
  save_rax(tmp);
  { Label L;
    bind(L);
    fprem();
    fwait(); fnstsw_ax();
#ifdef _LP64
    testl(rax, 0x400);
    jcc(Assembler::notEqual, L);
#else
    sahf();
    jcc(Assembler::parity, L);
#endif // _LP64
  }
  restore_rax(tmp);
  // Result is in ST0.
  // Note: fxch & fpop to get rid of ST1
  // (otherwise FPU stack could overflow eventually)
  fxch(1);
  fpop();
}


void MacroAssembler::incrementl(AddressLiteral dst) {
  if (reachable(dst)) {
    incrementl(as_Address(dst));
  } else {
    lea(rscratch1, dst);
    incrementl(Address(rscratch1, 0));
  }
}

void MacroAssembler::incrementl(ArrayAddress dst) {
  incrementl(as_Address(dst));
}

// Add value to reg with the shortest encoding; min_jint is handled first
// because negating it overflows.
void MacroAssembler::incrementl(Register reg, int value) {
  if (value == min_jint) {addl(reg, value) ; return; }
  if (value < 0) { decrementl(reg, -value); return; }
  if (value == 0) { ; return; }
  if (value == 1 && UseIncDec) { incl(reg) ; return; }
  /* else */ { addl(reg, value) ; return; }
}

void MacroAssembler::incrementl(Address dst, int value) {
  if (value == min_jint) {addl(dst, value) ; return; }
  if (value < 0) { decrementl(dst, -value); return; }
  if (value == 0) { ; return; }
  if (value == 1 && UseIncDec) { incl(dst) ; return; }
  /* else */ { addl(dst, value) ; return; }
}

void MacroAssembler::jump(AddressLiteral dst) {
  if (reachable(dst)) {
    jmp_literal(dst.target(), dst.rspec());
  } else {
    lea(rscratch1, dst);
    jmp(rscratch1);
  }
}

// Conditional jump to an AddressLiteral. When the target is reachable the
// jcc is emitted directly (short form if the displacement fits in 8 bits);
// otherwise the condition is reversed to skip over an indirect jmp.
void MacroAssembler::jump_cc(Condition cc, AddressLiteral dst) {
  if (reachable(dst)) {
    InstructionMark im(this);
    relocate(dst.reloc());
    const int short_size = 2;
    const int long_size = 6;
    int offs = (intptr_t)dst.target() - ((intptr_t)pc());
    if (dst.reloc() == relocInfo::none && is8bit(offs - short_size)) {
      // 0111 tttn #8-bit disp
      emit_int8(0x70 | cc);
      emit_int8((offs - short_size) & 0xFF);
    } else {
      // 0000 1111 1000 tttn #32-bit disp
      emit_int8(0x0F);
      emit_int8((unsigned char)(0x80 | cc));
      emit_int32(offs - long_size);
    }
  } else {
#ifdef ASSERT
    warning("reversing conditional branch");
#endif /* ASSERT */
    Label skip;
    jccb(reverse[cc], skip);
    lea(rscratch1, dst);
    Assembler::jmp(rscratch1);
    bind(skip);
  }
}

void MacroAssembler::ldmxcsr(AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ldmxcsr(as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ldmxcsr(Address(rscratch1, 0));
  }
}

// Returns the code offset of the load (for implicit null check bookkeeping).
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    off = offset();
    movsbl(dst, src); // movsxb
  } else {
    // Pre-P6: emulate sign extension with shifts.
    off = load_unsigned_byte(dst, src);
    shll(dst, 24);
    sarl(dst, 24);
  }
  return off;
}

// Note: load_signed_short used to be called load_signed_word.
// Although the 'w' in x86 opcodes refers to the term "word" in the assembler
// manual, which means 16 bits, that usage is found nowhere in HotSpot code.
// The term "word" in HotSpot means a 32- or 64-bit machine word.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    // This is dubious to me since it seems safe to do a signed 16 => 64 bit
    // version but this is what 64bit has always done. This seems to imply
    // that users are only using 32bits worth.
    off = offset();
    movswl(dst, src); // movsxw
  } else {
    off = load_unsigned_short(dst, src);
    shll(dst, 16);
    sarl(dst, 16);
  }
  return off;
}

int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true || ) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzbl(dst, src); // movzxb
  } else {
    xorl(dst, dst);
    off = offset();
    movb(dst, src);
  }
  return off;
}

// Note: load_unsigned_short used to be called load_unsigned_word.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  // According to Intel Doc. AP-526, "Zero-Extension of Short", p.16,
  // and "3.9 Partial Register Penalties", p. 22).
  int off;
  if (LP64_ONLY(true ||) VM_Version::is_P6() || src.uses(dst)) {
    off = offset();
    movzwl(dst, src); // movzxw
  } else {
    xorl(dst, dst);
    off = offset();
    movw(dst, src);
  }
  return off;
}

// Load a 1/2/4/8-byte value; on 32-bit, an 8-byte load needs dst2 for the
// high half.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case 8:
    assert(dst2 != noreg, "second dest register required");
    movl(dst, src);
    movl(dst2, src.plus_disp(BytesPerInt));
    break;
#else
  case 8: movq(dst, src); break;
#endif
  case 4: movl(dst, src); break;
  case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case 1: is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default: ShouldNotReachHere();
  }
}

// Store a 1/2/4/8-byte value; on 32-bit, an 8-byte store needs src2 for the
// high half.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
#ifndef _LP64
  case 8:
    assert(src2 != noreg, "second source register required");
    movl(dst, src);
    movl(dst.plus_disp(BytesPerInt), src2);
    break;
#else
  case 8: movq(dst, src); break;
#endif
  case 4: movl(dst, src); break;
  case 2: movw(dst, src); break;
  case 1: movb(dst, src); break;
  default: ShouldNotReachHere();
  }
}

void MacroAssembler::mov32(AddressLiteral dst, Register src) {
  if (reachable(dst)) {
    movl(as_Address(dst), src);
  } else {
    lea(rscratch1, dst);
    movl(Address(rscratch1, 0), src);
  }
}

void MacroAssembler::mov32(Register dst, AddressLiteral src) {
  if (reachable(src)) {
    movl(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movl(dst, Address(rscratch1, 0));
  }
}

// C++ bool manipulation

// The width of a C++ bool is implementation-defined; pick the store/load
// width matching sizeof(bool) for this toolchain.
void MacroAssembler::movbool(Register dst, Address src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbool(Address dst, bool boolconst) {
  if(sizeof(bool) == 1)
    movb(dst, (int) boolconst);
  else if(sizeof(bool) == 2)
    movw(dst, (int) boolconst);
  else if(sizeof(bool) == 4)
    movl(dst, (int) boolconst);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbool(Address dst, Register src) {
  if(sizeof(bool) == 1)
    movb(dst, src);
  else if(sizeof(bool) == 2)
    movw(dst, src);
  else if(sizeof(bool) == 4)
    movl(dst, src);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::movbyte(ArrayAddress dst, int src) {
  movb(as_Address(dst), src);
}

void MacroAssembler::movdl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movdl(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movdl(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movq(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movq(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movq(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movdbl(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    // movsd clears the upper half of dst; movlpd leaves it untouched.
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, as_Address(src));
    } else {
      movlpd(dst, as_Address(src));
    }
  } else {
    lea(rscratch1, src);
    if (UseXmmLoadAndClearUpper) {
      movsd (dst, Address(rscratch1, 0));
    } else {
      movlpd(dst, Address(rscratch1, 0));
    }
  }
}

void MacroAssembler::movflt(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    movss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    movss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movptr(Register dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movptr(Register dst, Address src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

// src should NEVER be a real pointer. Use AddressLiteral for true pointers
void MacroAssembler::movptr(Register dst, intptr_t src) {
  LP64_ONLY(mov64(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movptr(Address dst, Register src) {
  LP64_ONLY(movq(dst, src)) NOT_LP64(movl(dst, src));
}

void MacroAssembler::movdqu(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movdqu(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::movdqu(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movdqa(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movdqa(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::movdqa(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::movsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::movss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::movss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::movss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::mulsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::mulsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::mulsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::mulss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::mulss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::mulss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any (non-CC) registers
    // NOTE: cmpl is plenty here to provoke a segv
    cmpptr(rax, Address(reg, 0));
    // Note: should probably use testl(rax, Address(reg, 0));
    //       may be shorter code (however, this version of
    //       testl needs to be implemented first)
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

void MacroAssembler::os_breakpoint() {
  // instead of directly emitting a breakpoint, call os:breakpoint for better debugability
  // (e.g., MSVC can't call ps() otherwise)
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, os::breakpoint)));
}

void MacroAssembler::pop_CPU_state() {
  pop_FPU_state();
  pop_IU_state();
}

void MacroAssembler::pop_FPU_state() {
  NOT_LP64(frstor(Address(rsp, 0));)
  LP64_ONLY(fxrstor(Address(rsp, 0));)
  addptr(rsp, FPUStateSizeInWords * wordSize);
}

void MacroAssembler::pop_IU_state() {
  popa();
  LP64_ONLY(addq(rsp, 8));
  popf();
}

// Save Integer and Float state
// Warning: Stack must be 16 byte aligned (64bit)
void MacroAssembler::push_CPU_state() {
  push_IU_state();
  push_FPU_state();
}

void MacroAssembler::push_FPU_state() {
  subptr(rsp, FPUStateSizeInWords * wordSize);
#ifndef _LP64
  fnsave(Address(rsp, 0));
  fwait();
#else
  fxsave(Address(rsp, 0));
#endif // LP64
}

void MacroAssembler::push_IU_state() {
  // Push flags first because pusha kills them
  pushf();
  // Make sure rsp stays 16-byte aligned
  LP64_ONLY(subq(rsp, 8));
  pusha();
}

// Clear the thread's last_Java_frame anchor (sp always; fp/pc optionally).
void MacroAssembler::reset_last_Java_frame(Register java_thread, bool clear_fp, bool clear_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // we must set sp to zero to clear frame
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), NULL_WORD);
  if (clear_fp) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), NULL_WORD);
  }

  if (clear_pc)
    movptr(Address(java_thread, JavaThread::last_Java_pc_offset()), NULL_WORD);

}

// Undo save_rax: pop from the stack or copy back from tmp.
void MacroAssembler::restore_rax(Register tmp) {
  if (tmp == noreg) pop(rax);
  else if (tmp != rax) mov(rax, tmp);
}

// Round reg up to the next multiple of modulus (modulus must be a power of 2
// for the and-mask to work).
void MacroAssembler::round_to(Register reg, int modulus) {
  addptr(reg, modulus - 1);
  andptr(reg, -modulus);
}

// Preserve rax either on the stack (tmp == noreg) or in tmp.
void MacroAssembler::save_rax(Register tmp) {
  if (tmp == noreg) push(rax);
  else if (tmp != rax) mov(tmp, rax);
}

// Write serialization page so VM thread can do a pseudo remote membar.
// We use the current thread pointer to calculate a thread specific
// offset to write to within the page. This minimizes bus traffic
// due to cache line collision.
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  movl(tmp, thread);
  shrl(tmp, os::get_serialize_page_shift_count());
  andl(tmp, (os::vm_page_size() - sizeof(int)));

  Address index(noreg, tmp, Address::times_1);
  ExternalAddress page(os::get_memory_serialize_page());

  // Size of store must match masking code above
  movl(as_Address(ArrayAddress(page, index)), tmp);
}

// Calls to C land
//
// When entering C land, the rbp, & rsp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optional fp, optional pc) in the thread's
// frame anchor. sp is stored last.
void MacroAssembler::set_last_Java_frame(Register java_thread,
                                         Register last_java_sp,
                                         Register last_java_fp,
                                         address  last_java_pc) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rdi;
    get_thread(java_thread);
  }
  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = rsp;
  }

  // last_java_fp is optional

  if (last_java_fp->is_valid()) {
    movptr(Address(java_thread, JavaThread::last_Java_fp_offset()), last_java_fp);
  }

  // last_java_pc is optional

  if (last_java_pc != NULL) {
    lea(Address(java_thread,
                 JavaThread::frame_anchor_offset() + JavaFrameAnchor::last_Java_pc_offset()),
        InternalAddress(last_java_pc));

  }
  movptr(Address(java_thread, JavaThread::last_Java_sp_offset()), last_java_sp);
}

void MacroAssembler::shlptr(Register dst, int imm8) {
  LP64_ONLY(shlq(dst, imm8)) NOT_LP64(shll(dst, imm8));
}

void MacroAssembler::shrptr(Register dst, int imm8) {
  LP64_ONLY(shrq(dst, imm8)) NOT_LP64(shrl(dst, imm8));
}

// Sign-extend the low byte of reg in place; uses movsx when available,
// otherwise a shift pair.
void MacroAssembler::sign_extend_byte(Register reg) {
  if (LP64_ONLY(true ||) (VM_Version::is_P6() && reg->has_byte_register())) {
    movsbl(reg, reg); // movsxb
  } else {
    shll(reg, 24);
    sarl(reg, 24);
  }
}

// Sign-extend the low 16 bits of reg in place.
void MacroAssembler::sign_extend_short(Register reg) {
  if (LP64_ONLY(true ||) VM_Version::is_P6()) {
    movswl(reg, reg); // movsxw
  } else {
    shll(reg, 16);
    sarl(reg, 16);
  }
}

// Unlike the wrappers below, testl has no scratch-register fallback: the
// literal must be reachable.
void MacroAssembler::testl(Register dst, AddressLiteral src) {
  assert(reachable(src), "Address should be reachable");
  testl(dst, as_Address(src));
}

void MacroAssembler::sqrtsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::sqrtsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::sqrtsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::sqrtss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::sqrtss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::sqrtss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::subsd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::subsd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::subsd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::subss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::subss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::subss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::ucomisd(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ucomisd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ucomisd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::ucomiss(XMMRegister dst, AddressLiteral src) {
  if (reachable(src)) {
    Assembler::ucomiss(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::ucomiss(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::xorpd(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorpd(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::xorpd(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::xorps(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
  assert((UseAVX > 0) || (((intptr_t)src.target() & 15) == 0), "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::xorps(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::xorps(dst, Address(rscratch1, 0));
  }
}

void MacroAssembler::pshufb(XMMRegister dst, AddressLiteral src) {
  // Used in sign-bit flipping with aligned address.
  bool aligned_adr = (((intptr_t)src.target() & 15) == 0);
  assert((UseAVX > 0) || aligned_adr, "SSE mode requires address alignment 16 bytes");
  if (reachable(src)) {
    Assembler::pshufb(dst, as_Address(src));
  } else {
    lea(rscratch1, src);
    Assembler::pshufb(dst, Address(rscratch1, 0));
  }
}

// AVX 3-operands instructions

void MacroAssembler::vaddsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vaddss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vaddss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vaddss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vandpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vandpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vandpd(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vandps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vandps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vandps(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vdivsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vdivss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vdivss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vdivss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vmulsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vmulsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vmulsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vmulss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vmulss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vmulss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vsubsd(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubsd(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubsd(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vsubss(XMMRegister dst, XMMRegister nds, AddressLiteral src) {
  if (reachable(src)) {
    vsubss(dst, nds, as_Address(src));
  } else {
    lea(rscratch1, src);
    vsubss(dst, nds, Address(rscratch1, 0));
  }
}

void MacroAssembler::vxorpd(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vxorpd(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vxorpd(dst, nds, Address(rscratch1, 0), vector_len);
  }
}

void MacroAssembler::vxorps(XMMRegister dst, XMMRegister nds, AddressLiteral src, int vector_len) {
  if (reachable(src)) {
    vxorps(dst, nds, as_Address(src), vector_len);
  } else {
    lea(rscratch1, src);
    vxorps(dst, nds, Address(rscratch1, 0), vector_len);
  }
}


//////////////////////////////////////////////////////////////////////////////////
#if INCLUDE_ALL_GCS

// G1 SATB pre-barrier. If marking is active, records the previous value
// (pre_val, loaded from Address(obj, 0) when obj != noreg) in the thread's
// SATB queue, calling into the runtime when the queue is full.
void MacroAssembler::g1_write_barrier_pre(Register obj,
                                          Register pre_val,
                                          Register thread,
                                          Register tmp,
                                          bool tosca_live,
                                          bool expand_call) {

  // If expand_call is true then we expand the call_VM_leaf macro
  // directly to skip generating the check by
  // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp.

#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Label done;
  Label runtime;

  assert(pre_val != noreg, "check this code");

  if (obj != noreg) {
    assert_different_registers(obj, pre_val, tmp);
    assert(pre_val != rax, "check this code");
  }

  Address in_progress(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_active()));
  Address index(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::satb_mark_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));


  // Is marking active?
  if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
    cmpl(in_progress, 0);
  } else {
    assert(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
    cmpb(in_progress, 0);
  }
  jcc(Assembler::equal, done);

  // Do we need to load the previous value?
  if (obj != noreg) {
    load_heap_oop(pre_val, Address(obj, 0));
  }

  // Is the previous value null?
  cmpptr(pre_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // Can we store original value in the thread's buffer?
  // Is index == 0?
  // (The index field is typed as size_t.)

  movptr(tmp, index);                   // tmp := *index_adr
  cmpptr(tmp, 0);                       // tmp == 0?
  jcc(Assembler::equal, runtime);       // If yes, goto runtime

  subptr(tmp, wordSize);                // tmp := tmp - wordSize
  movptr(index, tmp);                   // *index_adr := tmp
  addptr(tmp, buffer);                  // tmp := tmp + *buffer_adr

  // Record the previous value
  movptr(Address(tmp, 0), pre_val);
  jmp(done);

  bind(runtime);
  // save the live input values
  if(tosca_live) push(rax);

  if (obj != noreg && obj != rax)
    push(obj);

  if (pre_val != rax)
    push(pre_val);

  // Calling the runtime using the regular call_VM_leaf mechanism generates
  // code (generated by InterpreterMacroAssembler::call_VM_leaf_base)
  // that checks that the *(ebp+frame::interpreter_frame_last_sp) == NULL.
  //
  // If we are generating the pre-barrier without a frame (e.g. in the
  // intrinsified Reference.get() routine) then ebp might be pointing to
  // the caller frame and so this check will most likely fail at runtime.
  //
  // Expanding the call directly bypasses the generation of the check.
  // So when we do not have have a full interpreter frame on the stack
  // expand_call should be passed true.

  NOT_LP64( push(thread); )

  if (expand_call) {
    LP64_ONLY( assert(pre_val != c_rarg1, "smashed arg"); )
    pass_arg1(this, thread);
    pass_arg0(this, pre_val);
    MacroAssembler::call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), 2);
  } else {
    call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), pre_val, thread);
  }

  NOT_LP64( pop(thread); )

  // save the live input values
  if (pre_val != rax)
    pop(pre_val);

  if (obj != noreg && obj != rax)
    pop(obj);

  if(tosca_live) pop(rax);

  bind(done);
}

// G1 post-barrier. When the store crosses heap regions and new_val is
// non-NULL, dirties the card for store_addr and enqueues its address in the
// thread's dirty-card queue, calling into the runtime when the queue is full.
void MacroAssembler::g1_write_barrier_post(Register store_addr,
                                           Register new_val,
                                           Register thread,
                                           Register tmp,
                                           Register tmp2) {
#ifdef _LP64
  assert(thread == r15_thread, "must be");
#endif // _LP64

  Address queue_index(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_index()));
  Address buffer(thread, in_bytes(JavaThread::dirty_card_queue_offset() +
                                       PtrQueue::byte_offset_of_buf()));

  CardTableModRefBS* ct =
    barrier_set_cast<CardTableModRefBS>(Universe::heap()->barrier_set());
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  Label done;
  Label runtime;

  // Does store cross heap regions?

  movptr(tmp, store_addr);
  xorptr(tmp, new_val);
  shrptr(tmp, HeapRegion::LogOfHRGrainBytes);
  jcc(Assembler::equal, done);

  // crosses regions, storing NULL?

  cmpptr(new_val, (int32_t) NULL_WORD);
  jcc(Assembler::equal, done);

  // storing region crossing non-NULL, is card already dirty?

  const Register card_addr = tmp;
  const Register cardtable = tmp2;

  movptr(card_addr, store_addr);
  shrptr(card_addr, CardTableModRefBS::card_shift);
  // Do not use ExternalAddress to load 'byte_map_base', since 'byte_map_base' is NOT
  // a valid address and therefore is not properly handled by the relocation code.
  movptr(cardtable, (intptr_t)ct->byte_map_base);
  addptr(card_addr, cardtable);

  cmpb(Address(card_addr, 0), (int)G1SATBCardTableModRefBS::g1_young_card_val());
  jcc(Assembler::equal, done);

  membar(Assembler::Membar_mask_bits(Assembler::StoreLoad));
  cmpb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());
  jcc(Assembler::equal, done);


  // storing a region crossing, non-NULL oop, card is clean.
  // dirty card and log.

  movb(Address(card_addr, 0), (int)CardTableModRefBS::dirty_card_val());

  cmpl(queue_index, 0);
  jcc(Assembler::equal, runtime);
  subl(queue_index, wordSize);
  movptr(tmp2, buffer);
#ifdef _LP64
  movslq(rscratch1, queue_index);
  addq(tmp2, rscratch1);
  movq(Address(tmp2, 0), card_addr);
#else
  addl(tmp2, queue_index);
  movl(Address(tmp2, 0), card_addr);
#endif
  jmp(done);

  bind(runtime);
  // save the live input values
  push(store_addr);
  push(new_val);
#ifdef _LP64
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, r15_thread);
#else
  push(thread);
  call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), card_addr, thread);
  pop(thread);
#endif
  pop(new_val);
  pop(store_addr);

  bind(done);
}

#endif // INCLUDE_ALL_GCS
//////////////////////////////////////////////////////////////////////////////////


// The destination address is ignored here: the card is derived from obj alone.
void MacroAssembler::store_check(Register obj, Address dst) {
  store_check(obj);
}

void MacroAssembler::store_check(Register obj) {
  // Does a store check for the oop in register obj. The content of
  // register obj is destroyed afterwards.

  BarrierSet* bs = Universe::heap()->barrier_set();
  assert(bs->kind() == BarrierSet::CardTableModRef, "Wrong barrier set kind");

  CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
  assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

  shrptr(obj, CardTableModRefBS::card_shift);

  Address card_addr;

  // The calculation for byte_map_base is as follows:
  // byte_map_base = _byte_map - (uintptr_t(low_bound) >> card_shift);
  // So this essentially converts an address to a displacement and it will
  // never need to be relocated. On 64bit however the value may be too
  // large for a 32bit displacement.
  intptr_t disp = (intptr_t) ct->byte_map_base;
  if (is_simm32(disp)) {
    card_addr = Address(noreg, obj, Address::times_1, disp);
  } else {
    // By doing it as an ExternalAddress 'disp' could be converted to a rip-relative
    // displacement and done in a single instruction given favorable mapping and a
    // smarter version of as_Address. However, 'ExternalAddress' generates a relocation
    // entry and that entry is not properly handled by the relocation code.
    AddressLiteral cardtable((address)ct->byte_map_base, relocInfo::none);
    Address index(noreg, obj, Address::times_1);
    card_addr = as_Address(ArrayAddress(cardtable, index));
  }

  int dirty = CardTableModRefBS::dirty_card_val();
  if (UseCondCardMark) {
    Label L_already_dirty;
    if (UseConcMarkSweepGC) {
      membar(Assembler::StoreLoad);
    }
    cmpb(card_addr, dirty);
    jcc(Assembler::equal, L_already_dirty);
    movb(card_addr, dirty);
    bind(L_already_dirty);
  } else {
    movb(card_addr, dirty);
  }
}

void MacroAssembler::subptr(Register dst, int32_t imm32) {
  LP64_ONLY(subq(dst, imm32)) NOT_LP64(subl(dst, imm32));
}

// Force generation of a 4 byte immediate value even if it fits into 8bit
void MacroAssembler::subptr_imm32(Register dst, int32_t imm32) {
  LP64_ONLY(subq_imm32(dst, imm32)) NOT_LP64(subl_imm32(dst, imm32));
}

void MacroAssembler::subptr(Register dst, Register src) {
  LP64_ONLY(subq(dst, src)) NOT_LP64(subl(dst, src));
}

// C++ bool manipulation
void MacroAssembler::testbool(Register dst) {
  if(sizeof(bool) == 1)
    testb(dst, 0xff);
  else if(sizeof(bool) == 2) {
    // testw implementation needed for two byte bools
    ShouldNotReachHere();
  } else if(sizeof(bool) == 4)
    testl(dst, dst);
  else
    // unsupported
    ShouldNotReachHere();
}

void MacroAssembler::testptr(Register dst, Register src) {
  LP64_ONLY(testq(dst, src)) NOT_LP64(testl(dst, src));
}

// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes.
4346 void MacroAssembler::tlab_allocate(Register obj, 4347 Register var_size_in_bytes, 4348 int con_size_in_bytes, 4349 Register t1, 4350 Register t2, 4351 Label& slow_case) { 4352 assert_different_registers(obj, t1, t2); 4353 assert_different_registers(obj, var_size_in_bytes, t1); 4354 Register end = t2; 4355 Register thread = NOT_LP64(t1) LP64_ONLY(r15_thread); 4356 4357 verify_tlab(); 4358 4359 NOT_LP64(get_thread(thread)); 4360 4361 movptr(obj, Address(thread, JavaThread::tlab_top_offset())); 4362 if (var_size_in_bytes == noreg) { 4363 lea(end, Address(obj, con_size_in_bytes)); 4364 } else { 4365 lea(end, Address(obj, var_size_in_bytes, Address::times_1)); 4366 } 4367 cmpptr(end, Address(thread, JavaThread::tlab_end_offset())); 4368 jcc(Assembler::above, slow_case); 4369 4370 // update the tlab top pointer 4371 movptr(Address(thread, JavaThread::tlab_top_offset()), end); 4372 4373 // recover var_size_in_bytes if necessary 4374 if (var_size_in_bytes == end) { 4375 subptr(var_size_in_bytes, obj); 4376 } 4377 verify_tlab(); 4378 } 4379 4380 // Preserves rbx, and rdx. 4381 Register MacroAssembler::tlab_refill(Label& retry, 4382 Label& try_eden, 4383 Label& slow_case) { 4384 Register top = rax; 4385 Register t1 = rcx; 4386 Register t2 = rsi; 4387 Register thread_reg = NOT_LP64(rdi) LP64_ONLY(r15_thread); 4388 assert_different_registers(top, thread_reg, t1, t2, /* preserve: */ rbx, rdx); 4389 Label do_refill, discard_tlab; 4390 4391 if (!Universe::heap()->supports_inline_contig_alloc()) { 4392 // No allocation in the shared eden. 
4393 jmp(slow_case); 4394 } 4395 4396 NOT_LP64(get_thread(thread_reg)); 4397 4398 movptr(top, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset()))); 4399 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset()))); 4400 4401 // calculate amount of free space 4402 subptr(t1, top); 4403 shrptr(t1, LogHeapWordSize); 4404 4405 // Retain tlab and allocate object in shared space if 4406 // the amount free in the tlab is too large to discard. 4407 cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset()))); 4408 jcc(Assembler::lessEqual, discard_tlab); 4409 4410 // Retain 4411 // %%% yuck as movptr... 4412 movptr(t2, (int32_t) ThreadLocalAllocBuffer::refill_waste_limit_increment()); 4413 addptr(Address(thread_reg, in_bytes(JavaThread::tlab_refill_waste_limit_offset())), t2); 4414 if (TLABStats) { 4415 // increment number of slow_allocations 4416 addl(Address(thread_reg, in_bytes(JavaThread::tlab_slow_allocations_offset())), 1); 4417 } 4418 jmp(try_eden); 4419 4420 bind(discard_tlab); 4421 if (TLABStats) { 4422 // increment number of refills 4423 addl(Address(thread_reg, in_bytes(JavaThread::tlab_number_of_refills_offset())), 1); 4424 // accumulate wastage -- t1 is amount free in tlab 4425 addl(Address(thread_reg, in_bytes(JavaThread::tlab_fast_refill_waste_offset())), t1); 4426 } 4427 4428 // if tlab is currently allocated (top or end != null) then 4429 // fill [top, end + alignment_reserve) with array object 4430 testptr(top, top); 4431 jcc(Assembler::zero, do_refill); 4432 4433 // set up the mark word 4434 movptr(Address(top, oopDesc::mark_offset_in_bytes()), (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2)); 4435 // set the length to the remaining space 4436 subptr(t1, typeArrayOopDesc::header_size(T_INT)); 4437 addptr(t1, (int32_t)ThreadLocalAllocBuffer::alignment_reserve()); 4438 shlptr(t1, log2_intptr(HeapWordSize/sizeof(jint))); 4439 movl(Address(top, arrayOopDesc::length_offset_in_bytes()), t1); 4440 // set 
klass to intArrayKlass 4441 // dubious reloc why not an oop reloc? 4442 movptr(t1, ExternalAddress((address)Universe::intArrayKlassObj_addr())); 4443 // store klass last. concurrent gcs assumes klass length is valid if 4444 // klass field is not null. 4445 store_klass(top, t1); 4446 4447 movptr(t1, top); 4448 subptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset()))); 4449 incr_allocated_bytes(thread_reg, t1, 0); 4450 4451 // refill the tlab with an eden allocation 4452 bind(do_refill); 4453 movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset()))); 4454 shlptr(t1, LogHeapWordSize); 4455 // allocate new tlab, address returned in top 4456 eden_allocate(top, t1, 0, t2, slow_case); 4457 4458 // Check that t1 was preserved in eden_allocate. 4459 #ifdef ASSERT 4460 if (UseTLAB) { 4461 Label ok; 4462 Register tsize = rsi; 4463 assert_different_registers(tsize, thread_reg, t1); 4464 push(tsize); 4465 movptr(tsize, Address(thread_reg, in_bytes(JavaThread::tlab_size_offset()))); 4466 shlptr(tsize, LogHeapWordSize); 4467 cmpptr(t1, tsize); 4468 jcc(Assembler::equal, ok); 4469 STOP("assert(t1 != tlab size)"); 4470 should_not_reach_here(); 4471 4472 bind(ok); 4473 pop(tsize); 4474 } 4475 #endif 4476 movptr(Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())), top); 4477 movptr(Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())), top); 4478 addptr(top, t1); 4479 subptr(top, (int32_t)ThreadLocalAllocBuffer::alignment_reserve_in_bytes()); 4480 movptr(Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())), top); 4481 verify_tlab(); 4482 jmp(retry); 4483 4484 return thread_reg; // for use by caller 4485 } 4486 4487 void MacroAssembler::incr_allocated_bytes(Register thread, 4488 Register var_size_in_bytes, 4489 int con_size_in_bytes, 4490 Register t1) { 4491 if (!thread->is_valid()) { 4492 #ifdef _LP64 4493 thread = r15_thread; 4494 #else 4495 assert(t1->is_valid(), "need temp reg"); 4496 thread = t1; 4497 get_thread(thread); 
4498 #endif 4499 } 4500 4501 #ifdef _LP64 4502 if (var_size_in_bytes->is_valid()) { 4503 addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes); 4504 } else { 4505 addq(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes); 4506 } 4507 #else 4508 if (var_size_in_bytes->is_valid()) { 4509 addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), var_size_in_bytes); 4510 } else { 4511 addl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())), con_size_in_bytes); 4512 } 4513 adcl(Address(thread, in_bytes(JavaThread::allocated_bytes_offset())+4), 0); 4514 #endif 4515 } 4516 4517 void MacroAssembler::fp_runtime_fallback(address runtime_entry, int nb_args, int num_fpu_regs_in_use) { 4518 pusha(); 4519 4520 // if we are coming from c1, xmm registers may be live 4521 int off = 0; 4522 if (UseSSE == 1) { 4523 subptr(rsp, sizeof(jdouble)*8); 4524 movflt(Address(rsp,off++*sizeof(jdouble)),xmm0); 4525 movflt(Address(rsp,off++*sizeof(jdouble)),xmm1); 4526 movflt(Address(rsp,off++*sizeof(jdouble)),xmm2); 4527 movflt(Address(rsp,off++*sizeof(jdouble)),xmm3); 4528 movflt(Address(rsp,off++*sizeof(jdouble)),xmm4); 4529 movflt(Address(rsp,off++*sizeof(jdouble)),xmm5); 4530 movflt(Address(rsp,off++*sizeof(jdouble)),xmm6); 4531 movflt(Address(rsp,off++*sizeof(jdouble)),xmm7); 4532 } else if (UseSSE >= 2) { 4533 if (UseAVX > 2) { 4534 movl(rbx, 0xffff); 4535 #ifdef _LP64 4536 kmovql(k1, rbx); 4537 #else 4538 kmovdl(k1, rbx); 4539 #endif 4540 } 4541 #ifdef COMPILER2 4542 if (MaxVectorSize > 16) { 4543 assert(UseAVX > 0, "256bit vectors are supported only with AVX"); 4544 // Save upper half of YMM registes 4545 subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); 4546 vextractf128h(Address(rsp, 0),xmm0); 4547 vextractf128h(Address(rsp, 16),xmm1); 4548 vextractf128h(Address(rsp, 32),xmm2); 4549 vextractf128h(Address(rsp, 48),xmm3); 4550 vextractf128h(Address(rsp, 64),xmm4); 4551 vextractf128h(Address(rsp, 
80),xmm5); 4552 vextractf128h(Address(rsp, 96),xmm6); 4553 vextractf128h(Address(rsp,112),xmm7); 4554 #ifdef _LP64 4555 vextractf128h(Address(rsp,128),xmm8); 4556 vextractf128h(Address(rsp,144),xmm9); 4557 vextractf128h(Address(rsp,160),xmm10); 4558 vextractf128h(Address(rsp,176),xmm11); 4559 vextractf128h(Address(rsp,192),xmm12); 4560 vextractf128h(Address(rsp,208),xmm13); 4561 vextractf128h(Address(rsp,224),xmm14); 4562 vextractf128h(Address(rsp,240),xmm15); 4563 #endif 4564 } 4565 #endif 4566 // Save whole 128bit (16 bytes) XMM regiters 4567 subptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); 4568 movdqu(Address(rsp,off++*16),xmm0); 4569 movdqu(Address(rsp,off++*16),xmm1); 4570 movdqu(Address(rsp,off++*16),xmm2); 4571 movdqu(Address(rsp,off++*16),xmm3); 4572 movdqu(Address(rsp,off++*16),xmm4); 4573 movdqu(Address(rsp,off++*16),xmm5); 4574 movdqu(Address(rsp,off++*16),xmm6); 4575 movdqu(Address(rsp,off++*16),xmm7); 4576 #ifdef _LP64 4577 movdqu(Address(rsp,off++*16),xmm8); 4578 movdqu(Address(rsp,off++*16),xmm9); 4579 movdqu(Address(rsp,off++*16),xmm10); 4580 movdqu(Address(rsp,off++*16),xmm11); 4581 movdqu(Address(rsp,off++*16),xmm12); 4582 movdqu(Address(rsp,off++*16),xmm13); 4583 movdqu(Address(rsp,off++*16),xmm14); 4584 movdqu(Address(rsp,off++*16),xmm15); 4585 #endif 4586 } 4587 4588 // Preserve registers across runtime call 4589 int incoming_argument_and_return_value_offset = -1; 4590 if (num_fpu_regs_in_use > 1) { 4591 // Must preserve all other FPU regs (could alternatively convert 4592 // SharedRuntime::dsin, dcos etc. into assembly routines known not to trash 4593 // FPU state, but can not trust C compiler) 4594 NEEDS_CLEANUP; 4595 // NOTE that in this case we also push the incoming argument(s) to 4596 // the stack and restore it later; we also use this stack slot to 4597 // hold the return value from dsin, dcos etc. 
4598 for (int i = 0; i < num_fpu_regs_in_use; i++) { 4599 subptr(rsp, sizeof(jdouble)); 4600 fstp_d(Address(rsp, 0)); 4601 } 4602 incoming_argument_and_return_value_offset = sizeof(jdouble)*(num_fpu_regs_in_use-1); 4603 for (int i = nb_args-1; i >= 0; i--) { 4604 fld_d(Address(rsp, incoming_argument_and_return_value_offset-i*sizeof(jdouble))); 4605 } 4606 } 4607 4608 subptr(rsp, nb_args*sizeof(jdouble)); 4609 for (int i = 0; i < nb_args; i++) { 4610 fstp_d(Address(rsp, i*sizeof(jdouble))); 4611 } 4612 4613 #ifdef _LP64 4614 if (nb_args > 0) { 4615 movdbl(xmm0, Address(rsp, 0)); 4616 } 4617 if (nb_args > 1) { 4618 movdbl(xmm1, Address(rsp, sizeof(jdouble))); 4619 } 4620 assert(nb_args <= 2, "unsupported number of args"); 4621 #endif // _LP64 4622 4623 // NOTE: we must not use call_VM_leaf here because that requires a 4624 // complete interpreter frame in debug mode -- same bug as 4387334 4625 // MacroAssembler::call_VM_leaf_base is perfectly safe and will 4626 // do proper 64bit abi 4627 4628 NEEDS_CLEANUP; 4629 // Need to add stack banging before this runtime call if it needs to 4630 // be taken; however, there is no generic stack banging routine at 4631 // the MacroAssembler level 4632 4633 MacroAssembler::call_VM_leaf_base(runtime_entry, 0); 4634 4635 #ifdef _LP64 4636 movsd(Address(rsp, 0), xmm0); 4637 fld_d(Address(rsp, 0)); 4638 #endif // _LP64 4639 addptr(rsp, sizeof(jdouble) * nb_args); 4640 if (num_fpu_regs_in_use > 1) { 4641 // Must save return value to stack and then restore entire FPU 4642 // stack except incoming arguments 4643 fstp_d(Address(rsp, incoming_argument_and_return_value_offset)); 4644 for (int i = 0; i < num_fpu_regs_in_use - nb_args; i++) { 4645 fld_d(Address(rsp, 0)); 4646 addptr(rsp, sizeof(jdouble)); 4647 } 4648 fld_d(Address(rsp, (nb_args-1)*sizeof(jdouble))); 4649 addptr(rsp, sizeof(jdouble) * nb_args); 4650 } 4651 4652 off = 0; 4653 if (UseSSE == 1) { 4654 movflt(xmm0, Address(rsp,off++*sizeof(jdouble))); 4655 movflt(xmm1, 
Address(rsp,off++*sizeof(jdouble))); 4656 movflt(xmm2, Address(rsp,off++*sizeof(jdouble))); 4657 movflt(xmm3, Address(rsp,off++*sizeof(jdouble))); 4658 movflt(xmm4, Address(rsp,off++*sizeof(jdouble))); 4659 movflt(xmm5, Address(rsp,off++*sizeof(jdouble))); 4660 movflt(xmm6, Address(rsp,off++*sizeof(jdouble))); 4661 movflt(xmm7, Address(rsp,off++*sizeof(jdouble))); 4662 addptr(rsp, sizeof(jdouble)*8); 4663 } else if (UseSSE >= 2) { 4664 // Restore whole 128bit (16 bytes) XMM regiters 4665 movdqu(xmm0, Address(rsp,off++*16)); 4666 movdqu(xmm1, Address(rsp,off++*16)); 4667 movdqu(xmm2, Address(rsp,off++*16)); 4668 movdqu(xmm3, Address(rsp,off++*16)); 4669 movdqu(xmm4, Address(rsp,off++*16)); 4670 movdqu(xmm5, Address(rsp,off++*16)); 4671 movdqu(xmm6, Address(rsp,off++*16)); 4672 movdqu(xmm7, Address(rsp,off++*16)); 4673 #ifdef _LP64 4674 movdqu(xmm8, Address(rsp,off++*16)); 4675 movdqu(xmm9, Address(rsp,off++*16)); 4676 movdqu(xmm10, Address(rsp,off++*16)); 4677 movdqu(xmm11, Address(rsp,off++*16)); 4678 movdqu(xmm12, Address(rsp,off++*16)); 4679 movdqu(xmm13, Address(rsp,off++*16)); 4680 movdqu(xmm14, Address(rsp,off++*16)); 4681 movdqu(xmm15, Address(rsp,off++*16)); 4682 #endif 4683 addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8)); 4684 #ifdef COMPILER2 4685 if (MaxVectorSize > 16) { 4686 // Restore upper half of YMM registes. 
      // Restore the upper 128 bits of YMM0..YMM7 (and YMM8..YMM15 on 64-bit)
      // that were spilled before the runtime call; the low halves were
      // reloaded by the movdqu sequence above.
      vinsertf128h(xmm0, Address(rsp,  0));
      vinsertf128h(xmm1, Address(rsp, 16));
      vinsertf128h(xmm2, Address(rsp, 32));
      vinsertf128h(xmm3, Address(rsp, 48));
      vinsertf128h(xmm4, Address(rsp, 64));
      vinsertf128h(xmm5, Address(rsp, 80));
      vinsertf128h(xmm6, Address(rsp, 96));
      vinsertf128h(xmm7, Address(rsp,112));
#ifdef _LP64
      vinsertf128h(xmm8, Address(rsp,128));
      vinsertf128h(xmm9, Address(rsp,144));
      vinsertf128h(xmm10, Address(rsp,160));
      vinsertf128h(xmm11, Address(rsp,176));
      vinsertf128h(xmm12, Address(rsp,192));
      vinsertf128h(xmm13, Address(rsp,208));
      vinsertf128h(xmm14, Address(rsp,224));
      vinsertf128h(xmm15, Address(rsp,240));
#endif
      // Pop the YMM-half save area: 16 bytes per register,
      // 16 registers on LP64, 8 on 32-bit.
      addptr(rsp, 16 * LP64_ONLY(16) NOT_LP64(8));
    }
#endif
  }
  popa();
}

// pi/4: the cutoff below which the x87 fsin/fcos/ftan instructions are used
// directly (no argument reduction needed in that range).
static const double pi_4 = 0.7853981633974483;

// Emit code computing sin ('s'), cos ('c') or tan ('t') of the double in
// the x87 top-of-stack (F-TOS), leaving the result in F-TOS.
// |x| <= pi/4 uses the hardware instruction; larger arguments fall back to
// the matching SharedRuntime routine via fp_runtime_fallback.
// num_fpu_regs_in_use tells the fallback how much of the FPU stack to save.
void MacroAssembler::trigfunc(char trig, int num_fpu_regs_in_use) {
  // A hand-coded argument reduction for values in fabs(pi/4, pi/2)
  // was attempted in this code; unfortunately it appears that the
  // switch to 80-bit precision and back causes this to be
  // unprofitable compared with simply performing a runtime call if
  // the argument is out of the (-pi/4, pi/4) range.

  Register tmp = noreg;
  if (!VM_Version::supports_cmov()) {
    // fcmp needs a temporary so preserve rbx,
    tmp = rbx;
    push(tmp);
  }

  Label slow_case, done;

  ExternalAddress pi4_adr = (address)&pi_4;
  if (reachable(pi4_adr)) {
    // x ?<= pi/4
    fld_d(pi4_adr);
    fld_s(1);                // Stack:  X  PI/4  X
    fabs();                  // Stack: |X| PI/4  X
    fcmp(tmp);
    jcc(Assembler::above, slow_case);

    // fastest case: -pi/4 <= x <= pi/4
    switch(trig) {
    case 's':
      fsin();
      break;
    case 'c':
      fcos();
      break;
    case 't':
      ftan();
      break;
    default:
      assert(false, "bad intrinsic");
      break;
    }
    jmp(done);
  }

  // slow case: runtime call
  // (If pi_4 was not reachable above, all arguments take this path.)
  bind(slow_case);

  switch(trig) {
  case 's':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dsin), 1, num_fpu_regs_in_use);
    }
    break;
  case 'c':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dcos), 1, num_fpu_regs_in_use);
    }
    break;
  case 't':
    {
      fp_runtime_fallback(CAST_FROM_FN_PTR(address, SharedRuntime::dtan), 1, num_fpu_regs_in_use);
    }
    break;
  default:
    assert(false, "bad intrinsic");
    break;
  }

  // Come here with result in F-TOS
  bind(done);

  if (tmp != noreg) {
    pop(tmp);
  }
}


// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface) {
  assert_different_registers(recv_klass, intf_klass, method_result, scan_temp);
  assert(itable_index.is_constant() || itable_index.as_register() == method_result,
         "caller must use same register for non-constant itable index as for method");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = InstanceKlass::vtable_start_offset() * wordSize;
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size() * wordSize;
  Address::ScaleFactor times_vte_scale = Address::times_ptr;
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  // scan_temp = number of vtable entries (positive int; movl is safe on LP64)
  movl(scan_temp, Address(recv_klass, InstanceKlass::vtable_length_offset() * wordSize));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  if (HeapWordsPerLong > 1) {
    // Round up to align_object_offset boundary
    // see code for InstanceKlass::start_of_itable!
    round_to(scan_temp, BytesPerLong);
  }

  // Adjust recv_klass by scaled itable_index, so we can free itable_index.
  assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
  lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  // The loop below is peeled once (peel == 1 is the first iteration) so the
  // common hit-on-first-entry case falls straight through to found_method.
  Label search, found_method;

  for (int peel = 1; peel >= 0; peel--) {
    movptr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmpptr(intf_klass, method_result);

    if (peel) {
      jccb(Assembler::equal, found_method);
    } else {
      jccb(Assembler::notEqual, search);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    testptr(method_result, method_result);
    jcc(Assembler::zero, L_no_such_interface);
    addptr(scan_temp, scan_step);
  }

  bind(found_method);

  // Got a hit.
  movl(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
  movptr(method_result, Address(recv_klass, scan_temp, Address::times_1));
}


// virtual method calling
// Loads the Method* for the given vtable slot of recv_klass into method_result.
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = InstanceKlass::vtable_start_offset() * wordSize;
  assert(vtableEntry::size() * wordSize == wordSize, "else adjust the scaling in the code below");
  Address vtable_entry_addr(recv_klass,
                            vtable_index, Address::times_ptr,
                            base + vtableEntry::method_offset_in_bytes());
  movptr(method_result, vtable_entry_addr);
}


// Convenience wrapper: branch to L_success iff sub_klass is a subtype of
// super_klass; otherwise fall through (via the bound L_failure).
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg,        &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


// Fast-path subtype check: self-check, then the super_check_offset display
// slot.  At most one of L_success / L_failure / L_slow_path may be NULL, in
// which case that outcome falls through.
void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg != noreg, "supply either a temp or a register offset");
  }

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jcc, which "knows" that L_fallthrough, at least, is in
  // range of a jccb. If this routine grows larger, reconsider at
  // least some of these.
#define local_jcc(assembler_cond, label)                                \
  if (&(label) == &L_fallthrough)  jccb(assembler_cond, label);         \
  else                             jcc( assembler_cond, label) /*omit semi*/

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            jmp(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmpptr(sub_klass, super_klass);
  local_jcc(Assembler::equal, *L_success);

  // Check the supertype display:
  if (must_load_sco) {
    // Positive movl does right thing on LP64.
    movl(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset, Address::times_1, 0);
  cmpptr(super_klass, super_check_addr); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    // Offset known only at runtime: equal means displayed-super hit;
    // otherwise distinguish definite failure from needs-slow-path by
    // whether the offset was the secondary-super-cache slot.
    local_jcc(Assembler::equal, *L_success);
    cmpl(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_slow_path);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_slow_path);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      local_jcc(Assembler::equal, *L_success);
    } else {
      local_jcc(Assembler::notEqual, *L_failure);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef local_jcc
#undef final_jmp
}


// Slow-path subtype check: linear scan of sub_klass's secondary-supers array
// with repne_scan, caching a hit in the secondary_super_cache.  At most one
// of L_success / L_failure may be NULL (that outcome falls through).  With
// set_cond_codes, ZF reflects the result for the caller (AD files).
void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != rax, "killed reg"); // killed by mov(rax, super)
  assert(sub_klass != rcx, "killed reg"); // killed by lea(rcx, &pst_counter)

  // Get super_klass value into rax (even if it was in rdi or rcx).
  bool pushed_rax = false, pushed_rcx = false, pushed_rdi = false;
  if (super_klass != rax || UseCompressedOops) {
    if (!IS_A_TEMP(rax)) { push(rax); pushed_rax = true; }
    mov(rax, super_klass);
  }
  if (!IS_A_TEMP(rcx)) { push(rcx); pushed_rcx = true; }
  if (!IS_A_TEMP(rdi)) { push(rdi); pushed_rdi = true; }

#ifndef PRODUCT
  // Bump the partial-subtype counter (diagnostics only).
  int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
  ExternalAddress pst_counter_addr((address) pst_counter);
  NOT_LP64(  incrementl(pst_counter_addr) );
  LP64_ONLY( lea(rcx, pst_counter_addr) );
  LP64_ONLY( incrementl(Address(rcx, 0)) );
#endif //PRODUCT

  // We will consult the secondary-super array.
  movptr(rdi, secondary_supers_addr);
  // Load the array length. (Positive movl does right thing on LP64.)
  movl(rcx, Address(rdi, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  addptr(rdi, Array<Klass*>::base_offset_in_bytes());

  // Scan RCX words at [RDI] for an occurrence of RAX.
  // Set NZ/Z based on last compare.
  // Z flag value will not be set by 'repne' if RCX == 0 since 'repne' does
  // not change flags (only scas instruction which is repeated sets flags).
  // Set Z = 0 (not equal) before 'repne' to indicate that class was not found.

  testptr(rax,rax); // Set Z = 0
  repne_scan();

  // Unspill the temp. registers:
  if (pushed_rdi)  pop(rdi);
  if (pushed_rcx)  pop(rcx);
  if (pushed_rax)  pop(rax);

  if (set_cond_codes) {
    // Special hack for the AD files: rdi is guaranteed non-zero.
    assert(!pushed_rdi, "rdi must be left non-NULL");
    // Also, the condition codes are properly set Z/NZ on succeed/failure.
  }

  if (L_failure == &L_fallthrough)
        jccb(Assembler::notEqual, *L_failure);
  else  jcc(Assembler::notEqual, *L_failure);

  // Success.  Cache the super we found and proceed in triumph.
  movptr(super_cache_addr, super_klass);

  if (L_success != &L_fallthrough) {
    jmp(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Conditional 32-bit move from memory; emulated with a branch when the CPU
// has no cmov support.
void MacroAssembler::cmov32(Condition cc, Register dst, Address src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

// Conditional 32-bit register-register move; emulated with a branch when the
// CPU has no cmov support.
void MacroAssembler::cmov32(Condition cc, Register dst, Register src) {
  if (VM_Version::supports_cmov()) {
    cmovl(cc, dst, src);
  } else {
    Label L;
    jccb(negate_condition(cc), L);
    movl(dst, src);
    bind(L);
  }
}

// Emit a call to the verify-oop stub for the oop in 'reg' (no-op unless
// -XX:+VerifyOops).  The message string 's' is included in any failure report.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  push(reg);                          // pass register argument
  ExternalAddress buffer((address) b);
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);
  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (oop, message) and restores rax, r10
  BLOCK_COMMENT("} verify_oop");
}


// If *delayed_value_addr is already known (non-zero), return it plus 'offset'
// as a constant; otherwise emit code loading it into 'tmp' at runtime and
// return tmp as the result.
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  movptr(tmp, ExternalAddress((address) delayed_value_addr));

#ifdef ASSERT
  // Trap at runtime if the delayed value is still zero when used.
  { Label L;
    testptr(tmp, tmp);
    if (WizardMode) {
      const char* buf = NULL;
      {
        ResourceMark rm;
        stringStream ss;
        ss.print("DelayedValue="INTPTR_FORMAT, delayed_value_addr[1]);
        buf = code_string(ss.as_string());
      }
      jcc(Assembler::notZero, L);
      STOP(buf);
    } else {
      jccb(Assembler::notZero, L);
      hlt();
    }
    bind(L);
  }
#endif

  if (offset != 0)
    addptr(tmp, offset);

  return RegisterOrConstant(tmp);
}


// Compute the address of interpreter argument slot 'arg_slot' (plus
// extra_slot_offset) relative to rsp, accounting for the return PC on stack.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  Register scale_reg = noreg;
  Address::ScaleFactor scale_factor = Address::no_scale;
  if (arg_slot.is_constant()) {
    offset += arg_slot.as_constant() * stackElementSize;
  } else {
    scale_reg    = arg_slot.as_register();
    scale_factor = Address::times(stackElementSize);
  }
  offset += wordSize;           // return PC is on stack
  return Address(rsp, scale_reg, scale_factor, offset);
}


// Like verify_oop, but for an oop held in memory at 'addr' (no-op unless
// -XX:+VerifyOops).
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  // Address adjust(addr.base(), addr.index(), addr.scale(), addr.disp() + BytesPerWord);
  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
#ifdef _LP64
  push(rscratch1);                    // save r10, trashed by movptr()
#endif
  push(rax);                          // save rax,
  // addr may contain rsp so we will have to adjust it based on the push
  // we just did (and on 64 bit we do two pushes)
  // NOTE: 64bit seemed to have had a bug in that it did movq(addr, rax); which
  // stores rax into addr which is backwards of what was intended.
  if (addr.uses(rsp)) {
    lea(rax, addr);
    pushptr(Address(rax, LP64_ONLY(2 *) BytesPerWord));
  } else {
    pushptr(addr);
  }

  ExternalAddress buffer((address) b);
  // pass msg argument
  // avoid using pushptr, as it modifies scratch registers
  // and our contract is not to modify anything
  movptr(rax, buffer.addr());
  push(rax);

  // call indirectly to solve generation ordering problem
  movptr(rax, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  call(rax);
  // Caller pops the arguments (addr, message) and restores rax, r10.
}

// Debug-only sanity check of the current thread's TLAB:
// asserts start <= top <= end (no-op unless ASSERT && UseTLAB && VerifyOops).
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;
    Register t1 = rsi;
    Register thread_reg = NOT_LP64(rbx) LP64_ONLY(r15_thread);

    push(t1);
    NOT_LP64(push(thread_reg));
    NOT_LP64(get_thread(thread_reg));

    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_start_offset())));
    jcc(Assembler::aboveEqual, next);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    movptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_end_offset())));
    cmpptr(t1, Address(thread_reg, in_bytes(JavaThread::tlab_top_offset())));
    jcc(Assembler::aboveEqual, ok);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    NOT_LP64(pop(thread_reg));
    pop(t1);
  }
#endif
}

// Debug-only decoder/pretty-printer for the x87 FPU control word.
// Bit positions follow the hardware layout of the control word.
class ControlWord {
 public:
  int32_t _value;

  int  rounding_control() const        { return  (_value >> 10) & 3      ; }
  int  precision_control() const       { return  (_value >>  8) & 3      ; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // rounding control
    const char* rc;
    switch (rounding_control()) {
      case 0: rc = "round near"; break;
      case 1: rc = "round down"; break;
      case 2: rc = "round up "; break;
      case 3: rc = "chop "; break;
    };
    // precision control
    const char* pc;
    switch (precision_control()) {
      case 0: pc = "24 bits "; break;
      case 1: pc = "reserved"; break;
      case 2: pc = "53 bits "; break;
      case 3: pc = "64 bits "; break;
    };
    // flags: uppercase letter = mask bit set, lowercase = clear
    char f[9];
    f[0] = ' ';
    f[1] = ' ';
    f[2] = (precision   ()) ? 'P' : 'p';
    f[3] = (underflow   ()) ? 'U' : 'u';
    f[4] = (overflow    ()) ? 'O' : 'o';
    f[5] = (zero_divide ()) ? 'Z' : 'z';
    f[6] = (denormalized()) ? 'D' : 'd';
    f[7] = (invalid     ()) ? 'I' : 'i';
    f[8] = '\x0';
    // output
    printf("%04x  masks = %s, %s, %s", _value & 0xFFFF, f, rc, pc);
  }

};

// Debug-only decoder/pretty-printer for the x87 FPU status word
// (condition codes C0-C3, top-of-stack index, exception flags).
class StatusWord {
 public:
  int32_t _value;

  bool busy() const                    { return ((_value >> 15) & 1) != 0; }
  bool C3() const                      { return ((_value >> 14) & 1) != 0; }
  bool C2() const                      { return ((_value >> 10) & 1) != 0; }
  bool C1() const                      { return ((_value >>  9) & 1) != 0; }
  bool C0() const                      { return ((_value >>  8) & 1) != 0; }
  int  top() const                     { return  (_value >> 11) & 7      ; }
  bool error_status() const            { return ((_value >>  7) & 1) != 0; }
  bool stack_fault() const             { return ((_value >>  6) & 1) != 0; }
  bool precision() const               { return ((_value >>  5) & 1) != 0; }
  bool underflow() const               { return ((_value >>  4) & 1) != 0; }
  bool overflow() const                { return ((_value >>  3) & 1) != 0; }
  bool zero_divide() const             { return ((_value >>  2) & 1) != 0; }
  bool denormalized() const            { return ((_value >>  1) & 1) != 0; }
  bool invalid() const                 { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // condition codes
    char c[5];
    c[0] = (C3()) ? '3' : '-';
    c[1] = (C2()) ? '2' : '-';
    c[2] = (C1()) ? '1' : '-';
    c[3] = (C0()) ? '0' : '-';
    c[4] = '\x0';
    // flags
    char f[9];
    f[0] = (error_status()) ? 'E' : '-';
    f[1] = (stack_fault ()) ? 'S' : '-';
    f[2] = (precision   ()) ? 'P' : '-';
    f[3] = (underflow   ()) ? 'U' : '-';
    f[4] = (overflow    ()) ? 'O' : '-';
    f[5] = (zero_divide ()) ? 'Z' : '-';
    f[6] = (denormalized()) ? 'D' : '-';
    f[7] = (invalid     ()) ? 'I' : '-';
    f[8] = '\x0';
    // output
    printf("%04x  flags = %s, cc =  %s, top = %d", _value & 0xFFFF, f, c, top());
  }

};

// Debug-only decoder for the x87 tag word (2-bit tag per register).
class TagWord {
 public:
  int32_t _value;

  int tag_at(int i) const              { return (_value >> (i*2)) & 3; }

  void print() const {
    printf("%04x", _value & 0xFFFF);
  }

};

// Debug-only view of one 80-bit x87 register (64-bit mantissa in _m1:_m0,
// 16-bit sign+exponent in _ex).
class FPU_Register {
 public:
  int32_t _m0;
  int32_t _m1;
  int16_t _ex;

  bool is_indefinite() const           {
    return _ex == -1 && _m1 == (int32_t)0xC0000000 && _m0 == 0;
  }

  void print() const {
    char  sign = (_ex < 0) ? '-' : '+';
    const char* kind = (_ex == 0x7FFF || _ex == (int16_t)-1) ? "NaN" : "   ";
    printf("%c%04hx.%08x%08x  %s", sign, _ex, _m1, _m0, kind);
  };

};

// Debug-only image of the full x87 FPU state as laid out by the state-save
// code (control/status/tag words, error & data pointers, 8 x 10-byte regs).
class FPU_State {
 public:
  enum {
    register_size       = 10,
    number_of_registers =  8,
    register_mask       =  7
  };

  ControlWord  _control_word;
  StatusWord   _status_word;
  TagWord      _tag_word;
  int32_t      _error_offset;
  int32_t      _error_selector;
  int32_t      _data_offset;
  int32_t      _data_selector;
  int8_t       _register[register_size * number_of_registers];

  // Tag for stack-relative register ST(i), adjusted by the current top.
  int tag_for_st(int i) const          { return _tag_word.tag_at((_status_word.top() + i) & register_mask); }
  FPU_Register* st(int i) const        { return (FPU_Register*)&_register[register_size * i]; }

  const char* tag_as_string(int tag) const {
    switch (tag) {
      case 0: return "valid";
      case 1: return "zero";
      case 2: return "special";
      case 3: return "empty";
    }
    ShouldNotReachHere();
    return NULL;
  }

  void print() const {
    // print computation registers
    { int t = _status_word.top();
      for (int i = 0; i < number_of_registers; i++) {
        int j = (i - t) & register_mask;
        printf("%c r%d = ST%d = ", (j == 0 ? '*' : ' '), i, j);
        st(j)->print();
        printf(" %s\n", tag_as_string(_tag_word.tag_at(i)));
      }
    }
    printf("\n");
    // print control registers
    printf("ctrl = "); _control_word.print(); printf("\n");
    printf("stat = "); _status_word .print(); printf("\n");
    printf("tags = "); _tag_word    .print(); printf("\n");
  }

};

// Debug-only decoder for the EFLAGS register.
class Flag_Register {
 public:
  int32_t _value;

  bool overflow() const                { return ((_value >> 11) & 1) != 0; }
  bool direction() const               { return ((_value >> 10) & 1) != 0; }
  bool sign() const                    { return ((_value >>  7) & 1) != 0; }
  bool zero() const                    { return ((_value >>  6) & 1) != 0; }
  bool auxiliary_carry() const         { return ((_value >>  4) & 1) != 0; }
  bool parity() const                  { return ((_value >>  2) & 1) != 0; }
  bool carry() const                   { return ((_value >>  0) & 1) != 0; }

  void print() const {
    // flags
    char f[8];
    f[0] = (overflow       ()) ? 'O' : '-';
    f[1] = (direction      ()) ? 'D' : '-';
    f[2] = (sign           ()) ? 'S' : '-';
    f[3] = (zero           ()) ? 'Z' : '-';
    f[4] = (auxiliary_carry()) ? 'A' : '-';
    f[5] = (parity         ()) ? 'P' : '-';
    f[6] = (carry          ()) ? 'C' : '-';
    f[7] = '\x0';
    // output
    printf("%08x  flags = %s", _value, f);
  }

};

// Debug-only wrapper for one integer register value.
class IU_Register {
 public:
  int32_t _value;

  void print() const {
    printf("%08x  %11d", _value, _value);
  }

};

// Debug-only image of the integer-unit state as pushed by push_CPU_state
// (note: pushed in pusha order, hence the field ordering here).
class IU_State {
 public:
  Flag_Register _eflags;
  IU_Register   _rdi;
  IU_Register   _rsi;
  IU_Register   _rbp;
  IU_Register   _rsp;
  IU_Register   _rbx;
  IU_Register   _rdx;
  IU_Register   _rcx;
  IU_Register   _rax;

  void print() const {
    // computation registers
    printf("rax,  = "); _rax.print(); printf("\n");
    printf("rbx,  = "); _rbx.print(); printf("\n");
    printf("rcx  = "); _rcx.print(); printf("\n");
    printf("rdx  = "); _rdx.print(); printf("\n");
    printf("rdi  = "); _rdi.print(); printf("\n");
    printf("rsi  = "); _rsi.print(); printf("\n");
    printf("rbp,  = "); _rbp.print(); printf("\n");
    printf("rsp  = "); _rsp.print(); printf("\n");
    printf("\n");
    // control registers
    printf("flgs = "); _eflags.print(); printf("\n");
  }
};

// Debug-only combined FPU + integer-unit state image.
class CPU_State {
 public:
  FPU_State _fpu_state;
  IU_State  _iu_state;

  void print() const {
    printf("--------------------------------------------------\n");
    _iu_state .print();
    printf("\n");
    _fpu_state.print();
    printf("--------------------------------------------------\n");
  }

};

// C helper called from generated code to dump the saved CPU state.
static void _print_CPU_state(CPU_State* state) {
  state->print();
};

// Emit code that saves the CPU state, prints it via _print_CPU_state, and
// restores it (debugging aid).
void MacroAssembler::print_CPU_state() {
  push_CPU_state();
  push(rsp);                // pass CPU state
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _print_CPU_state)));
  addptr(rsp, wordSize);       // discard argument
  pop_CPU_state();
}

// C helper called from generated code: checks the saved x87 stack against the
// expected depth; returns false (after printing and asserting) on mismatch.
// A negative stack_depth means "at most -stack_depth elements".
static bool _verify_FPU(int stack_depth, char* s, CPU_State* state) {
  static int counter = 0;
  FPU_State* fs = &state->_fpu_state;
  counter++;
  // For leaf calls, only verify that the top few elements remain empty.
  // We only need 1 empty at the top for C2 code.
  if( stack_depth < 0 ) {
    if( fs->tag_for_st(7) != 3 ) {
      printf("FPR7 not empty\n");
      state->print();
      assert(false, "error");
      return false;
    }
    return true;                // All other stack states do not matter
  }

  assert((fs->_control_word._value & 0xffff) == StubRoutines::_fpu_cntrl_wrd_std,
         "bad FPU control word");

  // compute stack depth
  int i = 0;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i)  < 3) i++;
  int d = i;
  while (i < FPU_State::number_of_registers && fs->tag_for_st(i) == 3) i++;
  // verify findings
  if (i != FPU_State::number_of_registers) {
    // stack not contiguous
    printf("%s: stack not contiguous at ST%d\n", s, i);
    state->print();
    assert(false, "error");
    return false;
  }
  // check if computed stack depth corresponds to expected stack depth
  // NOTE(review): this (stack_depth < 0) branch looks unreachable — the
  // early-return above already handles all negative depths; confirm.
  if (stack_depth < 0) {
    // expected stack depth is -stack_depth or less
    if (d > -stack_depth) {
      // too many elements on the stack
      printf("%s: <= %d stack elements expected but found %d\n", s, -stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  } else {
    // expected stack depth is stack_depth
    if (d != stack_depth) {
      // wrong stack depth
      printf("%s: %d stack elements expected but found %d\n", s, stack_depth, d);
      state->print();
      assert(false, "error");
      return false;
    }
  }
  // everything is cool
  return true;
}

// Emit code that saves the CPU state and calls _verify_FPU with the expected
// stack depth and message 's' (no-op unless -XX:+VerifyFPU).
void MacroAssembler::verify_FPU(int stack_depth, const char* s) {
  if (!VerifyFPU) return;
  push_CPU_state();
  push(rsp);                // pass CPU state
  ExternalAddress msg((address) s);
  // pass message string s
  pushptr(msg.addr());
  push(stack_depth);        // pass stack depth
  call(RuntimeAddress(CAST_FROM_FN_PTR(address, _verify_FPU)));
  addptr(rsp, 3 * wordSize);   // discard arguments
  // check for error
  { Label L;
    testl(rax, rax);
    jcc(Assembler::notZero, L);
    int3();                  // break if error condition
    bind(L);
  }
  pop_CPU_state();
}

// Emitted after returning from a JNI call: restores (or, with -Xcheck:jni,
// verifies) MXCSR and the x87 control word, and clears upper YMM bits.
void MacroAssembler::restore_cpu_control_state_after_jni() {
  // Either restore the MXCSR register after returning from the JNI Call
  // or verify that it wasn't changed (with -Xcheck:jni flag).
  if (VM_Version::supports_sse()) {
    if (RestoreMXCSROnJNICalls) {
      ldmxcsr(ExternalAddress(StubRoutines::addr_mxcsr_std()));
    } else if (CheckJNICalls) {
      call(RuntimeAddress(StubRoutines::x86::verify_mxcsr_entry()));
    }
  }
  if (VM_Version::supports_avx()) {
    // Clear upper bits of YMM registers to avoid SSE <-> AVX transition penalty.
    vzeroupper();
  }

#ifndef _LP64
  // Either restore the x87 floating point control word after returning
  // from the JNI call or verify that it wasn't changed.
  if (CheckJNICalls) {
    call(RuntimeAddress(StubRoutines::x86::verify_fpu_cntrl_wrd_entry()));
  }
#endif // _LP64
}


// Load the klass pointer of the oop in 'src' into 'dst', decoding it when
// compressed class pointers are in use.
void MacroAssembler::load_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedClassPointers) {
    movl(dst, Address(src, oopDesc::klass_offset_in_bytes()));
    decode_klass_not_null(dst);
  } else
#endif
    movptr(dst, Address(src, oopDesc::klass_offset_in_bytes()));
}

// Load the biased-locking prototype header of src's klass into 'dst'.
void MacroAssembler::load_prototype_header(Register dst, Register src) {
  load_klass(dst, src);
  movptr(dst, Address(dst, Klass::prototype_header_offset()));
}

// Store the klass pointer in 'src' into the oop at 'dst', encoding it when
// compressed class pointers are in use.  NOTE: clobbers 'src' in that case.
void MacroAssembler::store_klass(Register dst, Register src) {
#ifdef _LP64
  if (UseCompressedClassPointers) {
    encode_klass_not_null(src);
    movl(Address(dst, oopDesc::klass_offset_in_bytes()), src);
  } else
#endif
    movptr(Address(dst, oopDesc::klass_offset_in_bytes()), src);
}

// Load (and if necessary decode) a possibly-null heap oop from 'src'.
void MacroAssembler::load_heap_oop(Register dst, Address src) {
#ifdef _LP64
  // FIXME: Must change all places where we try to load the klass.
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop(dst);
  } else
#endif
    movptr(dst, src);
}

// Doesn't do verification, generates fixed size code
// (decode_heap_oop_not_null skips the null check, so the emitted sequence
// has a constant length).
void MacroAssembler::load_heap_oop_not_null(Register dst, Address src) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, src);
    decode_heap_oop_not_null(dst);
  } else
#endif
    movptr(dst, src);
}

// Store the (possibly null) heap oop in 'src' to 'dst', encoding it when
// compressed oops are in use.  NOTE: clobbers 'src' in that case.
void MacroAssembler::store_heap_oop(Address dst, Register src) {
#ifdef _LP64
  if (UseCompressedOops) {
    assert(!dst.uses(src), "not enough registers");
    encode_heap_oop(src);
    movl(dst, src);
  } else
#endif
    movptr(dst, src);
}

// Compare the oop in 'src1' with the (possibly compressed) oop at 'src2',
// decoding via 'tmp' (rax is pushed/popped as scratch if tmp == noreg).
void MacroAssembler::cmp_heap_oop(Register src1, Address src2, Register tmp) {
  assert_different_registers(src1, tmp);
#ifdef _LP64
  if (UseCompressedOops) {
    bool did_push = false;
    if (tmp == noreg) {
      tmp = rax;
      push(tmp);
      did_push = true;
      assert(!src2.uses(rsp), "can't push");
    }
    load_heap_oop(tmp, src2);
    cmpptr(src1, tmp);
    if (did_push)  pop(tmp);
  } else
#endif
    cmpptr(src1, src2);
}

// Used for storing NULLs.
// Store a NULL into a heap-oop field. With compressed oops only the
// 32-bit narrow slot is cleared; otherwise the full word is cleared.
void MacroAssembler::store_heap_oop_null(Address dst) {
#ifdef _LP64
  if (UseCompressedOops) {
    movl(dst, (int32_t)NULL_WORD);
  } else {
    movslq(dst, (int32_t)NULL_WORD);
  }
#else
  movl(dst, (int32_t)NULL_WORD);
#endif
}

#ifdef _LP64
// Zero the 32-bit klass-gap slot that exists in the object header layout
// when compressed class pointers are enabled.
void MacroAssembler::store_klass_gap(Register dst, Register src) {
  if (UseCompressedClassPointers) {
    // Store to klass gap in destination
    movl(Address(dst, oopDesc::klass_gap_offset_in_bytes()), src);
  }
}

#ifdef ASSERT
// Debug-only check (-XX:+CheckCompressedOops): verify that r12 still holds
// the current narrow-oop base; stop with 'msg' if it was corrupted.
void MacroAssembler::verify_heapbase(const char* msg) {
  assert (UseCompressedOops, "should be compressed");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  if (CheckCompressedOops) {
    Label ok;
    push(rscratch1); // cmpptr trashes rscratch1
    cmpptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    jcc(Assembler::equal, ok);
    STOP(msg);
    bind(ok);
    pop(rscratch1);
  }
}
#endif

// Algorithm must match oop.inline.hpp encode_heap_oop.
// Encode the (possibly NULL) oop in 'r' into its narrow form in place:
// subtract the heap base (NULL maps to the base via cmov so the result is 0)
// and shift right by the object-alignment shift.
void MacroAssembler::encode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
#endif
  verify_oop(r, "broken oop in encode_heap_oop");
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shrq(r, LogMinObjAlignmentInBytes);
    }
    return;
  }
  // NULL is mapped to the base so that (NULL - base) >> shift == 0.
  testq(r, r);
  cmovq(Assembler::equal, r, r12_heapbase);
  subq(r, r12_heapbase);
  shrq(r, LogMinObjAlignmentInBytes);
}

// Encode a known-non-NULL oop in place; skips the NULL cmov of the general
// form.  With CheckCompressedOops, traps if the oop is actually NULL.
void MacroAssembler::encode_heap_oop_not_null(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(r, r);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null");
    bind(ok);
  }
#endif
  verify_oop(r, "broken oop in encode_heap_oop_not_null");
  if (Universe::narrow_oop_base() != NULL) {
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(r, LogMinObjAlignmentInBytes);
  }
}

// Two-register variant of encode_heap_oop_not_null: 'src' is preserved.
void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::encode_heap_oop_not_null2: heap base corrupted?");
  if (CheckCompressedOops) {
    Label ok;
    testq(src, src);
    jcc(Assembler::notEqual, ok);
    STOP("null oop passed to encode_heap_oop_not_null2");
    bind(ok);
  }
#endif
  verify_oop(src, "broken oop in encode_heap_oop_not_null2");
  if (dst != src) {
    movq(dst, src);
  }
  if (Universe::narrow_oop_base() != NULL) {
    subq(dst, r12_heapbase);
  }
  if (Universe::narrow_oop_shift() != 0) {
    assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shrq(dst, LogMinObjAlignmentInBytes);
  }
}

// Decode the (possibly NULL) narrow oop in 'r' in place: shift left and,
// if there is a non-NULL base, add it — except for NULL, which must stay 0.
void MacroAssembler::decode_heap_oop(Register r) {
#ifdef ASSERT
  verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
#endif
  if (Universe::narrow_oop_base() == NULL) {
    if (Universe::narrow_oop_shift() != 0) {
      assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
      shlq(r, LogMinObjAlignmentInBytes);
    }
  } else {
    Label done;
    shlq(r, LogMinObjAlignmentInBytes);
    // The branch consumes the flags produced by shlq: skip adding the base
    // when the shifted narrow oop was NULL so NULL decodes to NULL.
    jccb(Assembler::equal, done);
    addq(r, r12_heapbase);
    bind(done);
  }
  verify_oop(r, "broken oop in decode_heap_oop");
}

// Decode a known-non-NULL narrow oop in place; no NULL check needed.
void MacroAssembler::decode_heap_oop_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    shlq(r, LogMinObjAlignmentInBytes);
    if (Universe::narrow_oop_base() != NULL) {
      addq(r, r12_heapbase);
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
  }
}

// Two-register variant; uses a single leaq when the shift equals times_8.
void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_oop_shift() != 0) {
    assert(LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
    if (LogMinObjAlignmentInBytes == Address::times_8) {
      // base + (src << 3) in one instruction.
      leaq(dst, Address(r12_heapbase, src, Address::times_8, 0));
    } else {
      if (dst != src) {
        movq(dst, src);
      }
      shlq(dst, LogMinObjAlignmentInBytes);
      if (Universe::narrow_oop_base() != NULL) {
        addq(dst, r12_heapbase);
      }
    }
  } else {
    assert (Universe::narrow_oop_base() == NULL, "sanity");
    if (dst != src) {
      movq(dst, src);
    }
  }
}

// Encode a known-non-NULL klass pointer in place. Borrows r12 as scratch
// for the klass base (klass base != oop base), then restores it.
void MacroAssembler::encode_klass_not_null(Register r) {
  if (Universe::narrow_klass_base() != NULL) {
    // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
    assert(r != r12_heapbase, "Encoding a klass in r12");
    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
    subq(r, r12_heapbase);
  }
  if (Universe::narrow_klass_shift() != 0) {
    assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shrq(r, LogKlassAlignmentInBytes);
  }
  if (Universe::narrow_klass_base() != NULL) {
    reinit_heapbase();
  }
}

// Two-register variant: computes dst = (src - klass_base) >> shift without
// clobbering r12 (dst itself holds the negated base).
void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
  if (dst == src) {
    encode_klass_not_null(src);
  } else {
    if (Universe::narrow_klass_base() != NULL) {
      mov64(dst, (int64_t)Universe::narrow_klass_base());
      negq(dst);
      addq(dst, src);
    } else {
      movptr(dst, src);
    }
    if (Universe::narrow_klass_shift() != 0) {
      assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      shrq(dst, LogKlassAlignmentInBytes);
    }
  }
}

// Function instr_size_for_decode_klass_not_null() counts the instructions
// generated by decode_klass_not_null(register r) and reinit_heapbase(),
// when (Universe::heap() != NULL). Hence, if the instructions they
// generate change, then this method needs to be updated.
int MacroAssembler::instr_size_for_decode_klass_not_null() {
  assert (UseCompressedClassPointers, "only for compressed klass ptrs");
  if (Universe::narrow_klass_base() != NULL) {
    // mov64 + addq + shlq? + mov64  (for reinit_heapbase()).
    return (Universe::narrow_klass_shift() == 0 ? 20 : 24);
  } else {
    // longest load decode klass function, mov64, leaq
    return 16;
  }
}

// !!! If the instructions that get generated here change then function
// instr_size_for_decode_klass_not_null() needs to get updated.
void MacroAssembler::decode_klass_not_null(Register r) {
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert(r != r12_heapbase, "Decoding a klass in r12");
  // Cannot assert, unverified entry point counts instructions (see .ad file)
  // vtableStubs also counts instructions in pd_code_size_limit.
  // Also do not verify_oop as this is called by verify_oop.
  if (Universe::narrow_klass_shift() != 0) {
    assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
    shlq(r, LogKlassAlignmentInBytes);
  }
  // Use r12 as a scratch register in which to temporarily load the narrow_klass_base.
  if (Universe::narrow_klass_base() != NULL) {
    mov64(r12_heapbase, (int64_t)Universe::narrow_klass_base());
    addq(r, r12_heapbase);
    reinit_heapbase();
  }
}

// Two-register variant: dst receives the klass base, then src is folded in
// via leaq (shift) or addq (no shift); r12 is untouched.
void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
  // Note: it will change flags
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  if (dst == src) {
    decode_klass_not_null(dst);
  } else {
    // Cannot assert, unverified entry point counts instructions (see .ad file)
    // vtableStubs also counts instructions in pd_code_size_limit.
    // Also do not verify_oop as this is called by verify_oop.
    mov64(dst, (int64_t)Universe::narrow_klass_base());
    if (Universe::narrow_klass_shift() != 0) {
      assert(LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
      assert(LogKlassAlignmentInBytes == Address::times_8, "klass not aligned on 64bits?");
      leaq(dst, Address(dst, src, Address::times_8, 0));
    } else {
      addq(dst, src);
    }
  }
}

// Emit a narrow-oop immediate move for 'obj', recorded with an oop
// relocation so the GC can find and update it.
void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}

// Memory-destination variant of set_narrow_oop(Register, jobject).
void MacroAssembler::set_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  mov_narrow_oop(dst, oop_index, rspec);
}

// Emit a narrow-klass immediate move for 'k' with a metadata relocation.
void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
}

// Memory-destination variant of set_narrow_klass(Register, Klass*).
void MacroAssembler::set_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  mov_narrow_oop(dst, Klass::encode_klass(k), rspec);
}

// Compare 'dst' against the narrow-oop immediate for 'obj' (relocated).
void MacroAssembler::cmp_narrow_oop(Register dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}

// Memory-operand variant of cmp_narrow_oop(Register, jobject).
void MacroAssembler::cmp_narrow_oop(Address dst, jobject obj) {
  assert (UseCompressedOops, "should only be used for compressed headers");
  assert (Universe::heap() != NULL, "java heap should be initialized");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int oop_index = oop_recorder()->find_index(obj);
  RelocationHolder rspec = oop_Relocation::spec(oop_index);
  Assembler::cmp_narrow_oop(dst, oop_index, rspec);
}

// Compare 'dst' against the narrow-klass immediate for 'k' (relocated).
void MacroAssembler::cmp_narrow_klass(Register dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
}

// Memory-operand variant of cmp_narrow_klass(Register, Klass*).
void MacroAssembler::cmp_narrow_klass(Address dst, Klass* k) {
  assert (UseCompressedClassPointers, "should only be used for compressed headers");
  assert (oop_recorder() != NULL, "this assembler needs an OopRecorder");
  int klass_index = oop_recorder()->find_index(k);
  RelocationHolder rspec = metadata_Relocation::spec(klass_index);
  Assembler::cmp_narrow_oop(dst, Klass::encode_klass(k), rspec);
}

// Reload r12 with the narrow-oop base (zero if the base is NULL, or an
// indirect load if the heap is not yet initialized).
void MacroAssembler::reinit_heapbase() {
  if (UseCompressedOops || UseCompressedClassPointers) {
    if (Universe::heap() != NULL) {
      if (Universe::narrow_oop_base() == NULL) {
        MacroAssembler::xorptr(r12_heapbase, r12_heapbase);
      } else {
        mov64(r12_heapbase, (int64_t)Universe::narrow_ptrs_base());
      }
    } else {
      movptr(r12_heapbase, ExternalAddress((address)Universe::narrow_ptrs_base_addr()));
    }
  }
}

#endif // _LP64


// C2 compiled method's prolog code.
void MacroAssembler::verified_entry(int framesize, int stack_bang_size, bool fp_mode_24b) {

  // WARNING: Initial instruction MUST be 5 bytes or longer so that
  // NativeJump::patch_verified_entry will be able to patch out the entry
  // code safely. The push to verify stack depth is ok at 5 bytes,
  // the frame allocation can be either 3 or 6 bytes. So if we don't do
  // stack bang then we must use the 6 byte frame allocation even if
  // we have no frame. :-(
  assert(stack_bang_size >= framesize || stack_bang_size <= 0, "stack bang size incorrect");

  assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned");
  // Remove word for return addr
  framesize -= wordSize;
  stack_bang_size -= wordSize;

  // Calls to C2R adapters often do not accept exceptional returns.
  // We require that their callers must bang for them. But be careful, because
  // some VM calls (such as call site linkage) can use several kilobytes of
  // stack. But the stack safety zone should account for that.
  // See bugs 4446381, 4468289, 4497237.
  if (stack_bang_size > 0) {
    generate_stack_overflow_check(stack_bang_size);

    // We always push rbp, so that on return to interpreter rbp, will be
    // restored correctly and we can correct the stack.
    push(rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      mov(rbp, rsp);
    }
    // Remove word for ebp
    framesize -= wordSize;

    // Create frame
    if (framesize) {
      subptr(rsp, framesize);
    }
  } else {
    // Create frame (force generation of a 4 byte immediate value)
    subptr_imm32(rsp, framesize);

    // Save RBP register now.
    framesize -= wordSize;
    movptr(Address(rsp, framesize), rbp);
    // Save caller's stack pointer into RBP if the frame pointer is preserved.
    if (PreserveFramePointer) {
      movptr(rbp, rsp);
      addptr(rbp, framesize + wordSize);
    }
  }

  if (VerifyStackAtCalls) { // Majik cookie to verify stack depth
    framesize -= wordSize;
    movptr(Address(rsp, framesize), (int32_t)0xbadb100d);
  }

#ifndef _LP64
  // If method sets FPU control word do it now
  if (fp_mode_24b) {
    fldcw(ExternalAddress(StubRoutines::addr_fpu_cntrl_wrd_24()));
  }
  if (UseSSE >= 2 && VerifyFPU) {
    verify_FPU(0, "FPU stack must be clean on entry");
  }
#endif

#ifdef ASSERT
  if (VerifyStackAtCalls) {
    Label L;
    push(rax);
    mov(rax, rsp);
    andptr(rax, StackAlignmentInBytes-1);
    cmpptr(rax, StackAlignmentInBytes-wordSize);
    pop(rax);
    jcc(Assembler::equal, L);
    STOP("Stack is not properly aligned!");
    bind(L);
  }
#endif

}

// Zero 'cnt' qwords starting at 'base' using rep stos; register bindings
// are fixed by the string instruction (rdi/rax/rcx).
void MacroAssembler::clear_mem(Register base, Register cnt, Register tmp) {
  // cnt - number of qwords (8-byte words).
  // base - start address, qword aligned.
  assert(base==rdi, "base register must be edi for rep stos");
  assert(tmp==rax, "tmp register must be eax for rep stos");
  assert(cnt==rcx, "cnt register must be ecx for rep stos");

  xorptr(tmp, tmp);
  if (UseFastStosb) {
    shlptr(cnt,3); // convert to number of bytes
    rep_stosb();
  } else {
    NOT_LP64(shlptr(cnt,1);) // convert to number of dwords for 32-bit VM
    rep_stos();
  }
}

// IndexOf for constant substrings with size >= 8 chars
// which don't need to be loaded through stack.
6159 void MacroAssembler::string_indexofC8(Register str1, Register str2, 6160 Register cnt1, Register cnt2, 6161 int int_cnt2, Register result, 6162 XMMRegister vec, Register tmp) { 6163 ShortBranchVerifier sbv(this); 6164 assert(UseSSE42Intrinsics, "SSE4.2 is required"); 6165 6166 // This method uses pcmpestri instruction with bound registers 6167 // inputs: 6168 // xmm - substring 6169 // rax - substring length (elements count) 6170 // mem - scanned string 6171 // rdx - string length (elements count) 6172 // 0xd - mode: 1100 (substring search) + 01 (unsigned shorts) 6173 // outputs: 6174 // rcx - matched index in string 6175 assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri"); 6176 6177 Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, 6178 RET_FOUND, RET_NOT_FOUND, EXIT, FOUND_SUBSTR, 6179 MATCH_SUBSTR_HEAD, RELOAD_STR, FOUND_CANDIDATE; 6180 6181 // Note, inline_string_indexOf() generates checks: 6182 // if (substr.count > string.count) return -1; 6183 // if (substr.count == 0) return 0; 6184 assert(int_cnt2 >= 8, "this code isused only for cnt2 >= 8 chars"); 6185 6186 // Load substring. 6187 movdqu(vec, Address(str2, 0)); 6188 movl(cnt2, int_cnt2); 6189 movptr(result, str1); // string addr 6190 6191 if (int_cnt2 > 8) { 6192 jmpb(SCAN_TO_SUBSTR); 6193 6194 // Reload substr for rescan, this code 6195 // is executed only for large substrings (> 8 chars) 6196 bind(RELOAD_SUBSTR); 6197 movdqu(vec, Address(str2, 0)); 6198 negptr(cnt2); // Jumped here with negative cnt2, convert to positive 6199 6200 bind(RELOAD_STR); 6201 // We came here after the beginning of the substring was 6202 // matched but the rest of it was not so we need to search 6203 // again. Start from the next element after the previous match. 6204 6205 // cnt2 is number of substring reminding elements and 6206 // cnt1 is number of string reminding elements when cmp failed. 
6207 // Restored cnt1 = cnt1 - cnt2 + int_cnt2 6208 subl(cnt1, cnt2); 6209 addl(cnt1, int_cnt2); 6210 movl(cnt2, int_cnt2); // Now restore cnt2 6211 6212 decrementl(cnt1); // Shift to next element 6213 cmpl(cnt1, cnt2); 6214 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 6215 6216 addptr(result, 2); 6217 6218 } // (int_cnt2 > 8) 6219 6220 // Scan string for start of substr in 16-byte vectors 6221 bind(SCAN_TO_SUBSTR); 6222 pcmpestri(vec, Address(result, 0), 0x0d); 6223 jccb(Assembler::below, FOUND_CANDIDATE); // CF == 1 6224 subl(cnt1, 8); 6225 jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string 6226 cmpl(cnt1, cnt2); 6227 jccb(Assembler::negative, RET_NOT_FOUND); // Left less then substring 6228 addptr(result, 16); 6229 jmpb(SCAN_TO_SUBSTR); 6230 6231 // Found a potential substr 6232 bind(FOUND_CANDIDATE); 6233 // Matched whole vector if first element matched (tmp(rcx) == 0). 6234 if (int_cnt2 == 8) { 6235 jccb(Assembler::overflow, RET_FOUND); // OF == 1 6236 } else { // int_cnt2 > 8 6237 jccb(Assembler::overflow, FOUND_SUBSTR); 6238 } 6239 // After pcmpestri tmp(rcx) contains matched element index 6240 // Compute start addr of substr 6241 lea(result, Address(result, tmp, Address::times_2)); 6242 6243 // Make sure string is still long enough 6244 subl(cnt1, tmp); 6245 cmpl(cnt1, cnt2); 6246 if (int_cnt2 == 8) { 6247 jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR); 6248 } else { // int_cnt2 > 8 6249 jccb(Assembler::greaterEqual, MATCH_SUBSTR_HEAD); 6250 } 6251 // Left less then substring. 6252 6253 bind(RET_NOT_FOUND); 6254 movl(result, -1); 6255 jmpb(EXIT); 6256 6257 if (int_cnt2 > 8) { 6258 // This code is optimized for the case when whole substring 6259 // is matched if its head is matched. 
6260 bind(MATCH_SUBSTR_HEAD); 6261 pcmpestri(vec, Address(result, 0), 0x0d); 6262 // Reload only string if does not match 6263 jccb(Assembler::noOverflow, RELOAD_STR); // OF == 0 6264 6265 Label CONT_SCAN_SUBSTR; 6266 // Compare the rest of substring (> 8 chars). 6267 bind(FOUND_SUBSTR); 6268 // First 8 chars are already matched. 6269 negptr(cnt2); 6270 addptr(cnt2, 8); 6271 6272 bind(SCAN_SUBSTR); 6273 subl(cnt1, 8); 6274 cmpl(cnt2, -8); // Do not read beyond substring 6275 jccb(Assembler::lessEqual, CONT_SCAN_SUBSTR); 6276 // Back-up strings to avoid reading beyond substring: 6277 // cnt1 = cnt1 - cnt2 + 8 6278 addl(cnt1, cnt2); // cnt2 is negative 6279 addl(cnt1, 8); 6280 movl(cnt2, 8); negptr(cnt2); 6281 bind(CONT_SCAN_SUBSTR); 6282 if (int_cnt2 < (int)G) { 6283 movdqu(vec, Address(str2, cnt2, Address::times_2, int_cnt2*2)); 6284 pcmpestri(vec, Address(result, cnt2, Address::times_2, int_cnt2*2), 0x0d); 6285 } else { 6286 // calculate index in register to avoid integer overflow (int_cnt2*2) 6287 movl(tmp, int_cnt2); 6288 addptr(tmp, cnt2); 6289 movdqu(vec, Address(str2, tmp, Address::times_2, 0)); 6290 pcmpestri(vec, Address(result, tmp, Address::times_2, 0), 0x0d); 6291 } 6292 // Need to reload strings pointers if not matched whole vector 6293 jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0 6294 addptr(cnt2, 8); 6295 jcc(Assembler::negative, SCAN_SUBSTR); 6296 // Fall through if found full substring 6297 6298 } // (int_cnt2 > 8) 6299 6300 bind(RET_FOUND); 6301 // Found result if we matched full small substring. 6302 // Compute substr offset 6303 subptr(result, str1); 6304 shrl(result, 1); // index 6305 bind(EXIT); 6306 6307 } // string_indexofC8 6308 6309 // Small strings are loaded through stack if they cross page boundary. 
// General IndexOf intrinsic: handles a small (< 8 chars) constant substring
// (int_cnt2 in 1..7) or a non-constant substring (int_cnt2 == -1, length in
// cnt2).  Strings that would read past a page boundary are first copied to
// the stack so 16-byte vector loads stay safe.  Match index (or -1) is left
// in 'result'.  Register bindings (rdx/rax/rcx) are dictated by pcmpestri.
void MacroAssembler::string_indexof(Register str1, Register str2,
                                    Register cnt1, Register cnt2,
                                    int int_cnt2,  Register result,
                                    XMMRegister vec, Register tmp) {
  ShortBranchVerifier sbv(this);
  assert(UseSSE42Intrinsics, "SSE4.2 is required");
  //
  // int_cnt2 is length of small (< 8 chars) constant substring
  // or (-1) for non constant substring in which case its length
  // is in cnt2 register.
  //
  // Note, inline_string_indexOf() generates checks:
  // if (substr.count > string.count) return -1;
  // if (substr.count == 0) return 0;
  //
  assert(int_cnt2 == -1 || (0 < int_cnt2 && int_cnt2 < 8), "should be != 0");

  // This method uses pcmpestri instruction with bound registers
  //   inputs:
  //     xmm - substring
  //     rax - substring length (elements count)
  //     mem - scanned string
  //     rdx - string length (elements count)
  //     0xd - mode: 1100 (substring search) + 01 (unsigned shorts)
  //   outputs:
  //     rcx - matched index in string
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");

  Label RELOAD_SUBSTR, SCAN_TO_SUBSTR, SCAN_SUBSTR, ADJUST_STR,
        RET_FOUND, RET_NOT_FOUND, CLEANUP, FOUND_SUBSTR,
        FOUND_CANDIDATE;

  { //========================================================
    // We don't know where these strings are located
    // and we can't read beyond them. Load them through stack.
    Label BIG_STRINGS, CHECK_STR, COPY_SUBSTR, COPY_STR;

    movptr(tmp, rsp); // save old SP

    if (int_cnt2 > 0) {     // small (< 8 chars) constant substring
      if (int_cnt2 == 1) {  // One char
        load_unsigned_short(result, Address(str2, 0));
        movdl(vec, result); // move 32 bits
      } else if (int_cnt2 == 2) { // Two chars
        movdl(vec, Address(str2, 0)); // move 32 bits
      } else if (int_cnt2 == 4) { // Four chars
        movq(vec, Address(str2, 0));  // move 64 bits
      } else { // cnt2 = { 3, 5, 6, 7 }
        // Array header size is 12 bytes in 32-bit VM
        // + 6 bytes for 3 chars == 18 bytes,
        // enough space to load vec and shift.
        assert(HeapWordSize*TypeArrayKlass::header_size() >= 12,"sanity");
        movdqu(vec, Address(str2, (int_cnt2*2)-16));
        psrldq(vec, 16-(int_cnt2*2));
      }
    } else { // not constant substring
      cmpl(cnt2, 8);
      jccb(Assembler::aboveEqual, BIG_STRINGS); // Both strings are big enough

      // We can read beyond string if str+16 does not cross page boundary
      // since heaps are aligned and mapped by pages.
      assert(os::vm_page_size() < (int)G, "default page should be small");
      movl(result, str2); // We need only low 32 bits
      andl(result, (os::vm_page_size()-1));
      cmpl(result, (os::vm_page_size()-16));
      jccb(Assembler::belowEqual, CHECK_STR);

      // Move small strings to stack to allow load 16 bytes into vec.
      subptr(rsp, 16);
      int stk_offset = wordSize-2;
      push(cnt2);

      bind(COPY_SUBSTR);
      load_unsigned_short(result, Address(str2, cnt2, Address::times_2, -2));
      movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
      decrement(cnt2);
      jccb(Assembler::notZero, COPY_SUBSTR);

      pop(cnt2);
      movptr(str2, rsp);  // New substring address
    } // non constant

    bind(CHECK_STR);
    cmpl(cnt1, 8);
    jccb(Assembler::aboveEqual, BIG_STRINGS);

    // Check cross page boundary.
    movl(result, str1); // We need only low 32 bits
    andl(result, (os::vm_page_size()-1));
    cmpl(result, (os::vm_page_size()-16));
    jccb(Assembler::belowEqual, BIG_STRINGS);

    subptr(rsp, 16);
    int stk_offset = -2;
    if (int_cnt2 < 0) { // not constant
      push(cnt2);
      stk_offset += wordSize;
    }
    movl(cnt2, cnt1);

    bind(COPY_STR);
    load_unsigned_short(result, Address(str1, cnt2, Address::times_2, -2));
    movw(Address(rsp, cnt2, Address::times_2, stk_offset), result);
    decrement(cnt2);
    jccb(Assembler::notZero, COPY_STR);

    if (int_cnt2 < 0) { // not constant
      pop(cnt2);
    }
    movptr(str1, rsp);  // New string address

    bind(BIG_STRINGS);
    // Load substring.
    if (int_cnt2 < 0) { // -1
      movdqu(vec, Address(str2, 0));
      push(cnt2);       // substr count
      push(str2);       // substr addr
      push(str1);       // string addr
    } else {
      // Small (< 8 chars) constant substrings are loaded already.
      movl(cnt2, int_cnt2);
    }
    push(tmp);  // original SP

  } // Finished loading

  //========================================================
  // Start search
  //

  movptr(result, str1); // string addr

  if (int_cnt2  < 0) {  // Only for non constant substring
    jmpb(SCAN_TO_SUBSTR);

    // SP saved at sp+0
    // String saved at sp+1*wordSize
    // Substr saved at sp+2*wordSize
    // Substr count saved at sp+3*wordSize

    // Reload substr for rescan, this code
    // is executed only for large substrings (> 8 chars)
    bind(RELOAD_SUBSTR);
    movptr(str2, Address(rsp, 2*wordSize));
    movl(cnt2, Address(rsp, 3*wordSize));
    movdqu(vec, Address(str2, 0));
    // We came here after the beginning of the substring was
    // matched but the rest of it was not so we need to search
    // again. Start from the next element after the previous match.
    subptr(str1, result); // Restore counter
    shrl(str1, 1);
    addl(cnt1, str1);
    decrementl(cnt1);   // Shift to next element
    cmpl(cnt1, cnt2);
    jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring

    addptr(result, 2);
  } // non constant

  // Scan string for start of substr in 16-byte vectors
  bind(SCAN_TO_SUBSTR);
  assert(cnt1 == rdx && cnt2 == rax && tmp == rcx, "pcmpestri");
  pcmpestri(vec, Address(result, 0), 0x0d);
  jccb(Assembler::below, FOUND_CANDIDATE);   // CF == 1
  subl(cnt1, 8);
  jccb(Assembler::lessEqual, RET_NOT_FOUND); // Scanned full string
  cmpl(cnt1, cnt2);
  jccb(Assembler::negative, RET_NOT_FOUND);  // Left less than substring
  addptr(result, 16);

  bind(ADJUST_STR);
  cmpl(cnt1, 8); // Do not read beyond string
  jccb(Assembler::greaterEqual, SCAN_TO_SUBSTR);
  // Back-up string to avoid reading beyond string.
  lea(result, Address(result, cnt1, Address::times_2, -16));
  movl(cnt1, 8);
  jmpb(SCAN_TO_SUBSTR);

  // Found a potential substr
  bind(FOUND_CANDIDATE);
  // After pcmpestri tmp(rcx) contains matched element index

  // Make sure string is still long enough
  subl(cnt1, tmp);
  cmpl(cnt1, cnt2);
  jccb(Assembler::greaterEqual, FOUND_SUBSTR);
  // Left less than substring.

  bind(RET_NOT_FOUND);
  movl(result, -1);
  jmpb(CLEANUP);

  bind(FOUND_SUBSTR);
  // Compute start addr of substr
  lea(result, Address(result, tmp, Address::times_2));

  if (int_cnt2 > 0) { // Constant substring
    // Repeat search for small substring (< 8 chars)
    // from new point without reloading substring.
    // Have to check that we don't read beyond string.
    cmpl(tmp, 8-int_cnt2);
    jccb(Assembler::greater, ADJUST_STR);
    // Fall through if matched whole substring.
  } else { // non constant
    assert(int_cnt2 == -1, "should be != 0");

    addl(tmp, cnt2);
    // Found result if we matched whole substring.
    cmpl(tmp, 8);
    jccb(Assembler::lessEqual, RET_FOUND);

    // Repeat search for small substring (<= 8 chars)
    // from new point 'str1' without reloading substring.
    cmpl(cnt2, 8);
    // Have to check that we don't read beyond string.
    jccb(Assembler::lessEqual, ADJUST_STR);

    Label CHECK_NEXT, CONT_SCAN_SUBSTR, RET_FOUND_LONG;
    // Compare the rest of substring (> 8 chars).
    movptr(str1, result);

    cmpl(tmp, cnt2);
    // First 8 chars are already matched.
    jccb(Assembler::equal, CHECK_NEXT);

    bind(SCAN_SUBSTR);
    pcmpestri(vec, Address(str1, 0), 0x0d);
    // Need to reload strings pointers if not matched whole vector
    jcc(Assembler::noOverflow, RELOAD_SUBSTR); // OF == 0

    bind(CHECK_NEXT);
    subl(cnt2, 8);
    jccb(Assembler::lessEqual, RET_FOUND_LONG); // Found full substring
    addptr(str1, 16);
    addptr(str2, 16);
    subl(cnt1, 8);
    cmpl(cnt2, 8); // Do not read beyond substring
    jccb(Assembler::greaterEqual, CONT_SCAN_SUBSTR);
    // Back-up strings to avoid reading beyond substring.
    lea(str2, Address(str2, cnt2, Address::times_2, -16));
    lea(str1, Address(str1, cnt2, Address::times_2, -16));
    subl(cnt1, cnt2);
    movl(cnt2, 8);
    addl(cnt1, 8);
    bind(CONT_SCAN_SUBSTR);
    movdqu(vec, Address(str2, 0));
    jmpb(SCAN_SUBSTR);

    bind(RET_FOUND_LONG);
    movptr(str1, Address(rsp, wordSize));
  } // non constant

  bind(RET_FOUND);
  // Compute substr offset
  subptr(result, str1);
  shrl(result, 1); // index

  bind(CLEANUP);
  pop(rsp); // restore SP

} // string_indexof

// Compare strings.
// Compare two char[] strings element by element.
// On exit 'result' holds the signed difference of the first mismatching
// char pair, or the length difference (cnt1 - cnt2, saved on the stack
// below) when one string is a prefix of the other.  cnt1/cnt2 are
// clobbered; vec1 is scratch.  In the SSE4.2/AVX2 paths the register
// assignment result==rax, cnt2==rdx, cnt1==rcx is required by pcmpestri.
void MacroAssembler::string_compare(Register str1, Register str2,
                                    Register cnt1, Register cnt2, Register result,
                                    XMMRegister vec1) {
  ShortBranchVerifier sbv(this);
  Label LENGTH_DIFF_LABEL, POP_LABEL, DONE_LABEL, WHILE_HEAD_LABEL;

  // Compute the minimum of the string lengths and the
  // difference of the string lengths (stack).
  // Do the conditional move stuff
  movl(result, cnt1);
  subl(cnt1, cnt2);
  push(cnt1);                                  // length difference, popped at LENGTH_DIFF_LABEL/POP_LABEL
  cmov32(Assembler::lessEqual, cnt2, result);  // cnt2 = min(cnt1, cnt2)

  // Is the minimum length zero?
  testl(cnt2, cnt2);
  jcc(Assembler::zero, LENGTH_DIFF_LABEL);

  // Compare first characters
  load_unsigned_short(result, Address(str1, 0));
  load_unsigned_short(cnt1, Address(str2, 0));
  subl(result, cnt1);
  jcc(Assembler::notZero, POP_LABEL);
  cmpl(cnt2, 1);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  // Check if the strings start at the same location.
  cmpptr(str1, str2);
  jcc(Assembler::equal, LENGTH_DIFF_LABEL);

  Address::ScaleFactor scale = Address::times_2;
  int stride = 8;

  if (UseAVX >= 2 && UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_WIDE_TAIL, COMPARE_SMALL_STR;
    Label COMPARE_WIDE_VECTORS_LOOP, COMPARE_16_CHARS, COMPARE_INDEX_CHAR;
    Label COMPARE_TAIL_LONG;
    int pcmpmask = 0x19;

    // Setup to compare 16-chars (32-bytes) vectors,
    // start from first character again because it has aligned address.
    int stride2 = 16;
    int adr_stride = stride << scale;

    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");
    // rax and rdx are used by pcmpestri as elements counters
    movl(result, cnt2);
    andl(cnt2, ~(stride2-1)); // cnt2 holds the vector count
    jcc(Assembler::zero, COMPARE_TAIL_LONG);

    // fast path : compare first 2 8-char vectors.
    bind(COMPARE_16_CHARS);
    movdqu(vec1, Address(str1, 0));
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jccb(Assembler::below, COMPARE_INDEX_CHAR);

    movdqu(vec1, Address(str1, adr_stride));
    pcmpestri(vec1, Address(str2, adr_stride), pcmpmask);
    jccb(Assembler::aboveEqual, COMPARE_WIDE_VECTORS);
    addl(cnt1, stride);

    // Compare the characters at index in cnt1
    bind(COMPARE_INDEX_CHAR); //cnt1 has the offset of the mismatching character
    load_unsigned_short(result, Address(str1, cnt1, scale));
    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
    subl(result, cnt2);
    jmp(POP_LABEL);

    // Setup the registers to start vector comparison loop
    bind(COMPARE_WIDE_VECTORS);
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    subl(result, stride2);
    subl(cnt2, stride2);
    jccb(Assembler::zero, COMPARE_WIDE_TAIL);
    negptr(result);

    // In a loop, compare 16-chars (32-bytes) at once using (vpxor+vptest)
    bind(COMPARE_WIDE_VECTORS_LOOP);
    vmovdqu(vec1, Address(str1, result, scale));
    vpxor(vec1, Address(str2, result, scale));
    vptest(vec1, vec1);
    jccb(Assembler::notZero, VECTOR_NOT_EQUAL);
    addptr(result, stride2);
    subl(cnt2, stride2);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS_LOOP);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);

    // compare wide vectors tail
    bind(COMPARE_WIDE_TAIL);
    testptr(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(result, stride2);
    movl(cnt2, result);
    negptr(result);
    jmpb(COMPARE_WIDE_VECTORS_LOOP);

    // Identifies the mismatching (higher or lower)16-bytes in the 32-byte vectors.
    bind(VECTOR_NOT_EQUAL);
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    jmp(COMPARE_16_CHARS);

    // Compare tail chars, length between 1 to 15 chars
    bind(COMPARE_TAIL_LONG);
    movl(cnt2, result);
    cmpl(cnt2, stride);
    jccb(Assembler::less, COMPARE_SMALL_STR);

    movdqu(vec1, Address(str1, 0));
    pcmpestri(vec1, Address(str2, 0), pcmpmask);
    jcc(Assembler::below, COMPARE_INDEX_CHAR);
    subptr(cnt2, stride);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);
    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(cnt2);
    jmpb(WHILE_HEAD_LABEL);

    bind(COMPARE_SMALL_STR);
  } else if (UseSSE42Intrinsics) {
    Label COMPARE_WIDE_VECTORS, VECTOR_NOT_EQUAL, COMPARE_TAIL;
    int pcmpmask = 0x19;
    // Setup to compare 8-char (16-byte) vectors,
    // start from first character again because it has aligned address.
    movl(result, cnt2);
    andl(cnt2, ~(stride - 1)); // cnt2 holds the vector count
    jccb(Assembler::zero, COMPARE_TAIL);

    lea(str1, Address(str1, result, scale));
    lea(str2, Address(str2, result, scale));
    negptr(result);

    // pcmpestri
    //   inputs:
    //     vec1- substring
    //     rax - negative string length (elements count)
    //     mem - scanned string
    //     rdx - string length (elements count)
    //     pcmpmask - cmp mode: 11000 (string compare with negated result)
    //               + 00 (unsigned bytes) or + 01 (unsigned shorts)
    //   outputs:
    //     rcx - first mismatched element index
    assert(result == rax && cnt2 == rdx && cnt1 == rcx, "pcmpestri");

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    // After pcmpestri cnt1(rcx) contains mismatched element index

    jccb(Assembler::below, VECTOR_NOT_EQUAL);  // CF==1
    addptr(result, stride);
    subptr(cnt2, stride);
    jccb(Assembler::notZero, COMPARE_WIDE_VECTORS);

    // compare wide vectors tail
    testptr(result, result);
    jccb(Assembler::zero, LENGTH_DIFF_LABEL);

    movl(cnt2, stride);
    movl(result, stride);
    negptr(result);
    movdqu(vec1, Address(str1, result, scale));
    pcmpestri(vec1, Address(str2, result, scale), pcmpmask);
    jccb(Assembler::aboveEqual, LENGTH_DIFF_LABEL);

    // Mismatched characters in the vectors
    bind(VECTOR_NOT_EQUAL);
    addptr(cnt1, result);
    load_unsigned_short(result, Address(str1, cnt1, scale));
    load_unsigned_short(cnt2, Address(str2, cnt1, scale));
    subl(result, cnt2);
    jmpb(POP_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(cnt2, result);
    // Fallthru to tail compare
  }
  // Shift str2 and str1 to the end of the arrays, negate min
  lea(str1, Address(str1, cnt2, scale));
  lea(str2, Address(str2, cnt2, scale));
  decrementl(cnt2);  // first character was compared already
  negptr(cnt2);

  // Compare the rest of the elements
  bind(WHILE_HEAD_LABEL);
  load_unsigned_short(result, Address(str1, cnt2, scale, 0));
  load_unsigned_short(cnt1, Address(str2, cnt2, scale, 0));
  subl(result, cnt1);
  jccb(Assembler::notZero, POP_LABEL);
  increment(cnt2);
  jccb(Assembler::notZero, WHILE_HEAD_LABEL);

  // Strings are equal up to min length.  Return the length difference.
  bind(LENGTH_DIFF_LABEL);
  pop(result);
  jmpb(DONE_LABEL);

  // Discard the stored length difference
  bind(POP_LABEL);
  pop(cnt1);

  // That's it
  bind(DONE_LABEL);
}

// Compare char[] arrays aligned to 4 bytes or substrings.
// Sets 'result' to 1 when equal, 0 otherwise.  When is_array_equ is true the
// inputs are array oops (null and length checks are emitted, and the data
// pointers are advanced past the array header); otherwise ary1/ary2 already
// point at the first char and 'limit' is the element count.
void MacroAssembler::char_arrays_equals(bool is_array_equ, Register ary1, Register ary2,
                                        Register limit, Register result, Register chr,
                                        XMMRegister vec1, XMMRegister vec2) {
  ShortBranchVerifier sbv(this);
  Label TRUE_LABEL, FALSE_LABEL, DONE, COMPARE_VECTORS, COMPARE_CHAR;

  int length_offset = arrayOopDesc::length_offset_in_bytes();
  int base_offset   = arrayOopDesc::base_offset_in_bytes(T_CHAR);

  // Check the input args
  cmpptr(ary1, ary2);
  jcc(Assembler::equal, TRUE_LABEL);

  if (is_array_equ) {
    // Need additional checks for arrays_equals.
    testptr(ary1, ary1);
    jcc(Assembler::zero, FALSE_LABEL);
    testptr(ary2, ary2);
    jcc(Assembler::zero, FALSE_LABEL);

    // Check the lengths
    movl(limit, Address(ary1, length_offset));
    cmpl(limit, Address(ary2, length_offset));
    jcc(Assembler::notEqual, FALSE_LABEL);
  }

  // count == 0
  testl(limit, limit);
  jcc(Assembler::zero, TRUE_LABEL);

  if (is_array_equ) {
    // Load array address
    lea(ary1, Address(ary1, base_offset));
    lea(ary2, Address(ary2, base_offset));
  }

  shll(limit, 1);      // byte count != 0
  movl(result, limit); // copy

  if (UseAVX >= 2) {
    // With AVX2, use 32-byte vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 32-byte vectors
    andl(result, 0x0000001e);  //   tail count (in bytes)
    andl(limit, 0xffffffe0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    vmovdqu(vec1, Address(ary1, limit, Address::times_1));
    vmovdqu(vec2, Address(ary2, limit, Address::times_1));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 32);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Tail: re-check the last (possibly overlapping) 32 bytes.
    vmovdqu(vec1, Address(ary1, result, Address::times_1, -32));
    vmovdqu(vec2, Address(ary2, result, Address::times_1, -32));
    vpxor(vec1, vec2);

    vptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  } else if (UseSSE42Intrinsics) {
    // With SSE4.2, use double quad vector compare
    Label COMPARE_WIDE_VECTORS, COMPARE_TAIL;

    // Compare 16-byte vectors
    andl(result, 0x0000000e);  //   tail count (in bytes)
    andl(limit, 0xfffffff0);   // vector count (in bytes)
    jccb(Assembler::zero, COMPARE_TAIL);

    lea(ary1, Address(ary1, limit, Address::times_1));
    lea(ary2, Address(ary2, limit, Address::times_1));
    negptr(limit);

    bind(COMPARE_WIDE_VECTORS);
    movdqu(vec1, Address(ary1, limit, Address::times_1));
    movdqu(vec2, Address(ary2, limit, Address::times_1));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    addptr(limit, 16);
    jcc(Assembler::notZero, COMPARE_WIDE_VECTORS);

    testl(result, result);
    jccb(Assembler::zero, TRUE_LABEL);

    // Tail: re-check the last (possibly overlapping) 16 bytes.
    movdqu(vec1, Address(ary1, result, Address::times_1, -16));
    movdqu(vec2, Address(ary2, result, Address::times_1, -16));
    pxor(vec1, vec2);

    ptest(vec1, vec1);
    jccb(Assembler::notZero, FALSE_LABEL);
    jmpb(TRUE_LABEL);

    bind(COMPARE_TAIL); // limit is zero
    movl(limit, result);
    // Fallthru to tail compare
  }

  // Compare 4-byte vectors
  andl(limit, 0xfffffffc); // vector count (in bytes)
  jccb(Assembler::zero, COMPARE_CHAR);

  lea(ary1, Address(ary1, limit, Address::times_1));
  lea(ary2, Address(ary2, limit, Address::times_1));
  negptr(limit);

  bind(COMPARE_VECTORS);
  movl(chr, Address(ary1, limit, Address::times_1));
  cmpl(chr, Address(ary2, limit, Address::times_1));
  jccb(Assembler::notEqual, FALSE_LABEL);
  addptr(limit, 4);
  jcc(Assembler::notZero, COMPARE_VECTORS);

  // Compare trailing char (final 2 bytes), if any
  bind(COMPARE_CHAR);
  testl(result, 0x2);   // tail char
  jccb(Assembler::zero, TRUE_LABEL);
  load_unsigned_short(chr, Address(ary1, 0));
  load_unsigned_short(limit, Address(ary2, 0));
  cmpl(chr, limit);
  jccb(Assembler::notEqual, FALSE_LABEL);

  bind(TRUE_LABEL);
  movl(result, 1);      // return true
  jmpb(DONE);

  bind(FALSE_LABEL);
  xorl(result, result); // return false

  // That's it
  bind(DONE);
  if (UseAVX >= 2) {
    // clean upper bits of YMM registers
    vpxor(vec1, vec1);
    vpxor(vec2, vec2);
  }
}

// Fill 'count' elements of BasicType 't' (T_BYTE/T_SHORT/T_INT only; any
// other type hits ShouldNotReachHere) starting at 'to' with 'value'.
// 'value' is first replicated into a full 32-bit pattern; the bulk is then
// written with the widest stores available (scalar, SSE2, AVX2 or AVX-512)
// and the remaining tail is filled element by element.
void MacroAssembler::generate_fill(BasicType t, bool aligned,
                                   Register to, Register value, Register count,
                                   Register rtmp, XMMRegister xtmp) {
  ShortBranchVerifier sbv(this);
  assert_different_registers(to, value, count, rtmp);
  Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
  Label L_fill_2_bytes, L_fill_4_bytes;

  int shift = -1;
  switch (t) {
    case T_BYTE:
      shift = 2;
      break;
    case T_SHORT:
      shift = 1;
      break;
    case T_INT:
      shift = 0;
      break;
    default: ShouldNotReachHere();
  }

  // Replicate the element value across all 32 bits.
  if (t == T_BYTE) {
    andl(value, 0xff);
    movl(rtmp, value);
    shll(rtmp, 8);
    orl(value, rtmp);
  }
  if (t == T_SHORT) {
    andl(value, 0xffff);
  }
  if (t == T_BYTE || t == T_SHORT) {
    movl(rtmp, value);
    shll(rtmp, 16);
    orl(value, rtmp);
  }

  cmpl(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
  jcc(Assembler::below, L_fill_4_bytes); // use unsigned cmp
  if (!UseUnalignedLoadStores && !aligned && (t == T_BYTE || t == T_SHORT)) {
    // align source address at 4 bytes address boundary
    if (t == T_BYTE) {
      // One byte misalignment happens only for byte arrays
      testptr(to, 1);
      jccb(Assembler::zero, L_skip_align1);
      movb(Address(to, 0), value);
      increment(to);
      decrement(count);
      BIND(L_skip_align1);
    }
    // Two bytes misalignment happens only for byte and short (char) arrays
    testptr(to, 2);
    jccb(Assembler::zero, L_skip_align2);
    movw(Address(to, 0), value);
    addptr(to, 2);
    subl(count, 1<<(shift-1));
    BIND(L_skip_align2);
  }
  if (UseSSE < 2) {
    Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
    // Fill 32-byte chunks
    subl(count, 8 << shift);
    jcc(Assembler::less, L_check_fill_8_bytes);
    align(16);

    BIND(L_fill_32_bytes_loop);

    for (int i = 0; i < 32; i += 4) {
      movl(Address(to, i), value);
    }

    addptr(to, 32);
    subl(count, 8 << shift);
    jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);
    BIND(L_check_fill_8_bytes);
    addl(count, 8 << shift);
    jccb(Assembler::zero, L_exit);
    jmpb(L_fill_8_bytes);

    //
    // length is too short, just fill qwords
    //
    BIND(L_fill_8_bytes_loop);
    movl(Address(to, 0), value);
    movl(Address(to, 4), value);
    addptr(to, 8);
    BIND(L_fill_8_bytes);
    subl(count, 1 << (shift + 1));
    jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    // fall through to fill 4 bytes
  } else {
    Label L_fill_32_bytes;
    if (!UseUnalignedLoadStores) {
      // align to 8 bytes, we know we are 4 byte aligned to start
      testptr(to, 4);
      jccb(Assembler::zero, L_fill_32_bytes);
      movl(Address(to, 0), value);
      addptr(to, 4);
      subl(count, 1<<shift);
    }
    BIND(L_fill_32_bytes);
    {
      assert( UseSSE >= 2, "supported cpu only" );
      Label L_fill_32_bytes_loop, L_check_fill_8_bytes, L_fill_8_bytes_loop, L_fill_8_bytes;
      if (UseAVX > 2) {
        // Set up an all-ones opmask for the EVEX stores below.
        movl(rtmp, 0xffff);
#ifdef _LP64
        kmovql(k1, rtmp);
#else
        kmovdl(k1, rtmp);
#endif
      }
      movdl(xtmp, value);
      if (UseAVX > 2 && UseUnalignedLoadStores) {
        // Fill 64-byte chunks
        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
        evpbroadcastd(xtmp, xtmp, Assembler::AVX_512bit);

        subl(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);
        align(16);

        BIND(L_fill_64_bytes_loop);
        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_512bit);
        addptr(to, 64);
        subl(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        BIND(L_check_fill_32_bytes);
        addl(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        evmovdqu(Address(to, 0), xtmp, Assembler::AVX_256bit);
        addptr(to, 32);
        subl(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
      } else if (UseAVX == 2 && UseUnalignedLoadStores) {
        // Fill 64-byte chunks
        Label L_fill_64_bytes_loop, L_check_fill_32_bytes;
        vpbroadcastd(xtmp, xtmp);

        subl(count, 16 << shift);
        jcc(Assembler::less, L_check_fill_32_bytes);
        align(16);

        BIND(L_fill_64_bytes_loop);
        vmovdqu(Address(to, 0), xtmp);
        vmovdqu(Address(to, 32), xtmp);
        addptr(to, 64);
        subl(count, 16 << shift);
        jcc(Assembler::greaterEqual, L_fill_64_bytes_loop);

        BIND(L_check_fill_32_bytes);
        addl(count, 8 << shift);
        jccb(Assembler::less, L_check_fill_8_bytes);
        vmovdqu(Address(to, 0), xtmp);
        addptr(to, 32);
        subl(count, 8 << shift);

        BIND(L_check_fill_8_bytes);
        // clean upper bits of YMM registers
        movdl(xtmp, value);
        pshufd(xtmp, xtmp, 0);
      } else {
        // Fill 32-byte chunks
        pshufd(xtmp, xtmp, 0);

        subl(count, 8 << shift);
        jcc(Assembler::less, L_check_fill_8_bytes);
        align(16);

        BIND(L_fill_32_bytes_loop);

        if (UseUnalignedLoadStores) {
          movdqu(Address(to, 0), xtmp);
          movdqu(Address(to, 16), xtmp);
        } else {
          movq(Address(to, 0), xtmp);
          movq(Address(to, 8), xtmp);
          movq(Address(to, 16), xtmp);
          movq(Address(to, 24), xtmp);
        }

        addptr(to, 32);
        subl(count, 8 << shift);
        jcc(Assembler::greaterEqual, L_fill_32_bytes_loop);

        BIND(L_check_fill_8_bytes);
      }
      addl(count, 8 << shift);
      jccb(Assembler::zero, L_exit);
      jmpb(L_fill_8_bytes);

      //
      // length is too short, just fill qwords
      //
      BIND(L_fill_8_bytes_loop);
      movq(Address(to, 0), xtmp);
      addptr(to, 8);
      BIND(L_fill_8_bytes);
      subl(count, 1 << (shift + 1));
      jcc(Assembler::greaterEqual, L_fill_8_bytes_loop);
    }
  }
  // fill trailing 4 bytes
  BIND(L_fill_4_bytes);
  testl(count, 1<<shift);
  jccb(Assembler::zero, L_fill_2_bytes);
  movl(Address(to, 0), value);
  if (t == T_BYTE || t == T_SHORT) {
    addptr(to, 4);
    BIND(L_fill_2_bytes);
    // fill trailing 2 bytes
    testl(count, 1<<(shift-1));
    jccb(Assembler::zero, L_fill_byte);
    movw(Address(to, 0), value);
    if (t == T_BYTE) {
      addptr(to, 2);
      BIND(L_fill_byte);
      // fill trailing byte
      testl(count, 1);
      jccb(Assembler::zero, L_exit);
      movb(Address(to, 0), value);
    } else {
      BIND(L_fill_byte);
    }
  } else {
    BIND(L_fill_2_bytes);
  }
  BIND(L_exit);
}

// encode char[] to byte[] in ISO_8859_1
// Copies low bytes of chars from src to dst, stopping at the first char
// whose high byte is non-zero (testl(tmp5, 0xff00) in the scalar tail;
// the 0xff00ff00 mask + ptest/vptest in the vector paths).  On exit
// 'result' holds the number of chars successfully encoded.
void MacroAssembler::encode_iso_array(Register src, Register dst, Register len,
                                      XMMRegister tmp1Reg, XMMRegister tmp2Reg,
                                      XMMRegister tmp3Reg, XMMRegister tmp4Reg,
                                      Register tmp5, Register result) {
  // rsi: src
  // rdi: dst
  // rdx: len
  // rcx: tmp5
  // rax: result
  ShortBranchVerifier sbv(this);
  assert_different_registers(src, dst, len, tmp5, result);
  Label L_done, L_copy_1_char, L_copy_1_char_exit;

  // set result
  xorl(result, result);
  // check for zero length
  testl(len, len);
  jcc(Assembler::zero, L_done);
  movl(result, len);

  // Setup pointers: src/dst point past the end, len becomes a negative index.
  lea(src, Address(src, len, Address::times_2)); // char[]
  lea(dst, Address(dst, len, Address::times_1)); // byte[]
  negptr(len);

  if (UseSSE42Intrinsics || UseAVX >= 2) {
    Label L_chars_8_check, L_copy_8_chars, L_copy_8_chars_exit;
    Label L_chars_16_check, L_copy_16_chars, L_copy_16_chars_exit;

    if (UseAVX >= 2) {
      Label L_chars_32_check, L_copy_32_chars, L_copy_32_chars_exit;
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      vpbroadcastd(tmp1Reg, tmp1Reg);
      jmpb(L_chars_32_check);

      bind(L_copy_32_chars);
      vmovdqu(tmp3Reg, Address(src, len, Address::times_2, -64));
      vmovdqu(tmp4Reg, Address(src, len, Address::times_2, -32));
      vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_32_chars_exit);
      vpackuswb(tmp3Reg, tmp3Reg, tmp4Reg, /* vector_len */ 1);
      vpermq(tmp4Reg, tmp3Reg, 0xD8, /* vector_len */ 1);
      vmovdqu(Address(dst, len, Address::times_1, -32), tmp4Reg);

      bind(L_chars_32_check);
      addptr(len, 32);
      jccb(Assembler::lessEqual, L_copy_32_chars);

      bind(L_copy_32_chars_exit);
      subptr(len, 16);
      jccb(Assembler::greater, L_copy_16_chars_exit);

    } else if (UseSSE42Intrinsics) {
      movl(tmp5, 0xff00ff00);   // create mask to test for Unicode chars in vector
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
      jmpb(L_chars_16_check);
    }

    bind(L_copy_16_chars);
    if (UseAVX >= 2) {
      vmovdqu(tmp2Reg, Address(src, len, Address::times_2, -32));
      vptest(tmp2Reg, tmp1Reg);
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      vpackuswb(tmp2Reg, tmp2Reg, tmp1Reg, /* vector_len */ 1);
      vpermq(tmp3Reg, tmp2Reg, 0xD8, /* vector_len */ 1);
    } else {
      if (UseAVX > 0) {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        vpor(tmp2Reg, tmp3Reg, tmp4Reg, /* vector_len */ 0);
      } else {
        movdqu(tmp3Reg, Address(src, len, Address::times_2, -32));
        por(tmp2Reg, tmp3Reg);
        movdqu(tmp4Reg, Address(src, len, Address::times_2, -16));
        por(tmp2Reg, tmp4Reg);
      }
      ptest(tmp2Reg, tmp1Reg);       // check for Unicode chars in vector
      jccb(Assembler::notZero, L_copy_16_chars_exit);
      packuswb(tmp3Reg, tmp4Reg);
    }
    movdqu(Address(dst, len, Address::times_1, -16), tmp3Reg);

    bind(L_chars_16_check);
    addptr(len, 16);
    jccb(Assembler::lessEqual, L_copy_16_chars);

    bind(L_copy_16_chars_exit);
    if (UseAVX >= 2) {
      // clean upper bits of YMM registers
      vpxor(tmp2Reg, tmp2Reg);
      vpxor(tmp3Reg, tmp3Reg);
      vpxor(tmp4Reg, tmp4Reg);
      movdl(tmp1Reg, tmp5);
      pshufd(tmp1Reg, tmp1Reg, 0);
    }
    subptr(len, 8);
    jccb(Assembler::greater, L_copy_8_chars_exit);

    bind(L_copy_8_chars);
    movdqu(tmp3Reg, Address(src, len, Address::times_2, -16));
    ptest(tmp3Reg, tmp1Reg);
    jccb(Assembler::notZero, L_copy_8_chars_exit);
    packuswb(tmp3Reg, tmp1Reg);
    movq(Address(dst, len, Address::times_1, -8), tmp3Reg);
    addptr(len, 8);
    jccb(Assembler::lessEqual, L_copy_8_chars);

    bind(L_copy_8_chars_exit);
    subptr(len, 8);
    jccb(Assembler::zero, L_done);
  }

  bind(L_copy_1_char);
  load_unsigned_short(tmp5, Address(src, len, Address::times_2, 0));
  testl(tmp5, 0xff00);    // check if Unicode char
  jccb(Assembler::notZero, L_copy_1_char_exit);
  movb(Address(dst, len, Address::times_1, 0), tmp5);
  addptr(len, 1);
  jccb(Assembler::less, L_copy_1_char);

  bind(L_copy_1_char_exit);
  addptr(result, len); // len is negative count of not processed elements
  bind(L_done);
}

#ifdef _LP64
/**
 * Helper for multiply_to_len().
 * dest_hi:dest_lo += src1 + src2, propagating carries into dest_hi.
 */
void MacroAssembler::add2_with_carry(Register dest_hi, Register dest_lo, Register src1, Register src2) {
  addq(dest_lo, src1);
  adcq(dest_hi, 0);
  addq(dest_lo, src2);
  adcq(dest_hi, 0);
}

/**
 * Multiply 64 bit by 64 bit first loop.
 * First loop of multiply_to_len: multiplies every 64-bit chunk of y by
 * x[xstart] and stores the low 64 bits into z, carrying into 'carry'.
 * Clobbers rax/rdx via mulq; values are rotated with rorq because the
 * jint arrays are stored most-significant-word first.
 */
void MacroAssembler::multiply_64_x_64_loop(Register x, Register xstart, Register x_xstart,
                                           Register y, Register y_idx, Register z,
                                           Register carry, Register product,
                                           Register idx, Register kdx) {
  //
  //  jlong carry, x[], y[], z[];
  //  for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) {
  //    huge_128 product = y[idx] * x[xstart] + carry;
  //    z[kdx] = (jlong)product;
  //    carry  = (jlong)(product >>> 64);
  //  }
  //  z[xstart] = carry;
  //

  Label L_first_loop, L_first_loop_exit;
  Label L_one_x, L_one_y, L_multiply;

  decrementl(xstart);
  jcc(Assembler::negative, L_one_x);  // only one 32-bit word left in x

  movq(x_xstart, Address(x, xstart, Address::times_4, 0));
  rorq(x_xstart, 32); // convert big-endian to little-endian

  bind(L_first_loop);
  decrementl(idx);
  jcc(Assembler::negative, L_first_loop_exit);
  decrementl(idx);
  jcc(Assembler::negative, L_one_y);  // only one 32-bit word left in y
  movq(y_idx, Address(y, idx, Address::times_4, 0));
  rorq(y_idx, 32); // convert big-endian to little-endian
  bind(L_multiply);
  movq(product, x_xstart);
  mulq(y_idx); // product(rax) * y_idx -> rdx:rax
  addq(product, carry);
  adcq(rdx, 0);
  subl(kdx, 2);
  movl(Address(z, kdx, Address::times_4, 4), product);
  shrq(product, 32);
  movl(Address(z, kdx, Address::times_4, 0), product);
  movq(carry, rdx);
  jmp(L_first_loop);

  bind(L_one_y);
  movl(y_idx, Address(y, 0));
  jmp(L_multiply);

  bind(L_one_x);
  movl(x_xstart, Address(x, 0));
  jmp(L_first_loop);

  bind(L_first_loop_exit);
}

/**
 * Multiply 64 bit by 64 bit and add 128 bit.
 * Computes rdx:z[idx..idx+1] = y[idx..idx+1] * x_xstart + z[idx..idx+1] + carry,
 * writing the low 64 bits back into z and leaving the high 64 bits in rdx
 * (clobbered by mulq) for the caller to pick up as the next carry.
 */
void MacroAssembler::multiply_add_128_x_128(Register x_xstart, Register y, Register z,
                                            Register yz_idx, Register idx,
                                            Register carry, Register product, int offset) {
  //     huge_128 product = (y[idx] * x_xstart) + z[kdx] + carry;
  //     z[kdx] = (jlong)product;

  movq(yz_idx, Address(y, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movq(yz_idx, Address(z, idx, Address::times_4, offset));
  rorq(yz_idx, 32); // convert big-endian to little-endian

  add2_with_carry(rdx, product, carry, yz_idx);

  movl(Address(z, idx, Address::times_4, offset+4), product);
  shrq(product, 32);
  movl(Address(z, idx, Address::times_4, offset), product);

}

/**
 * Multiply 128 bit by 128 bit. Unrolled inner loop.
 * Processes y in 128-bit (4-jint) steps via two multiply_add_128_x_128
 * calls, then handles the 0-3 remaining jints.  The running carry is
 * threaded through 'carry'/'carry2' (rdx holds each partial high half).
 */
void MacroAssembler::multiply_128_x_128_loop(Register x_xstart, Register y, Register z,
                                             Register yz_idx, Register idx, Register jdx,
                                             Register carry, Register product,
                                             Register carry2) {
  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 product = (y[idx+1] * x_xstart) + z[kdx+idx+1] + carry;
  //     z[kdx+idx+1] = (jlong)product;
  //     jlong carry2  = (jlong)(product >>> 64);
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry2;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     product = (y[idx] * x_xstart) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)product;
  //     carry  = (jlong)(product >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);          // jdx = number of 4-jint groups

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 8);
  movq(carry2, rdx);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry2, product, 0);
  movq(carry, rdx);
  jmp(L_third_loop);

  bind (L_third_loop_exit);

  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  multiply_add_128_x_128(x_xstart, y, z, yz_idx, idx, carry, product, 0);
  movq(carry, rdx);

  bind (L_check_1);
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);

  // One trailing 32-bit word remains.
  movl(yz_idx, Address(y, idx, Address::times_4, 0));
  movq(product, x_xstart);
  mulq(yz_idx); // product(rax) * yz_idx -> rdx:product(rax)
  movl(yz_idx, Address(z, idx, Address::times_4, 0));

  add2_with_carry(rdx, product, yz_idx, carry);

  movl(Address(z, idx, Address::times_4, 0), product);
  shrq(product, 32);

  shlq(rdx, 32);
  orq(product, rdx);
  movq(carry, product);

  bind(L_post_third_loop_done);
}

/**
 * Multiply 128 bit by 128 bit using BMI2. Unrolled inner loop.
 * BMI2 variant of multiply_128_x_128_loop: the multiplicand lives in the
 * implicit rdx operand of mulxq, and when ADX is available the two carry
 * chains are kept in CF (adcxq) and OF (adoxq) concurrently.
 */
void MacroAssembler::multiply_128_x_128_bmi2_loop(Register y, Register z,
                                                  Register carry, Register carry2,
                                                  Register idx, Register jdx,
                                                  Register yz_idx1, Register yz_idx2,
                                                  Register tmp, Register tmp3, Register tmp4) {
  assert(UseBMI2Instructions, "should be used only when BMI2 is available");

  //   jlong carry, x[], y[], z[];
  //   int kdx = ystart+1;
  //   for (int idx=ystart-2; idx >= 0; idx -= 2) { // Third loop
  //     huge_128 tmp3 = (y[idx+1] * rdx) + z[kdx+idx+1] + carry;
  //     jlong carry2  = (jlong)(tmp3 >>> 64);
  //     huge_128 tmp4 = (y[idx]   * rdx) + z[kdx+idx] + carry2;
  //     carry  = (jlong)(tmp4 >>> 64);
  //     z[kdx+idx+1] = (jlong)tmp3;
  //     z[kdx+idx] = (jlong)tmp4;
  //   }
  //   idx += 2;
  //   if (idx > 0) {
  //     yz_idx1 = (y[idx] * rdx) + z[kdx+idx] + carry;
  //     z[kdx+idx] = (jlong)yz_idx1;
  //     carry  = (jlong)(yz_idx1 >>> 64);
  //   }
  //

  Label L_third_loop, L_third_loop_exit, L_post_third_loop_done;

  movl(jdx, idx);
  andl(jdx, 0xFFFFFFFC);
  shrl(jdx, 2);          // jdx = number of 4-jint groups

  bind(L_third_loop);
  subl(jdx, 1);
  jcc(Assembler::negative, L_third_loop_exit);
  subl(idx, 4);

  movq(yz_idx1, Address(y, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32); // convert big-endian to little-endian
  movq(yz_idx2, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  mulxq(tmp4, tmp3, yz_idx1);  //  yz_idx1 * rdx -> tmp4:tmp3
  mulxq(carry2, tmp, yz_idx2); //  yz_idx2 * rdx -> carry2:tmp

  movq(yz_idx1, Address(z, idx, Address::times_4, 8));
  rorxq(yz_idx1, yz_idx1, 32);
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  if (VM_Version::supports_adx()) {
    // Dual carry chains: adcxq uses/updates only CF, adoxq only OF.
    adcxq(tmp3, carry);
    adoxq(tmp3, yz_idx1);

    adcxq(tmp4, tmp);
    adoxq(tmp4, yz_idx2);

    movl(carry, 0); // does not affect flags
    adcxq(carry2, carry);
    adoxq(carry2, carry);
  } else {
    add2_with_carry(tmp4, tmp3, carry, yz_idx1);
    add2_with_carry(carry2, tmp4, tmp, yz_idx2);
  }
  movq(carry, carry2);

  movl(Address(z, idx, Address::times_4, 12), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4,  8), tmp3);

  movl(Address(z, idx, Address::times_4,  4), tmp4);
  shrq(tmp4, 32);
  movl(Address(z, idx, Address::times_4,  0), tmp4);

  jmp(L_third_loop);

  bind (L_third_loop_exit);

  andl (idx, 0x3);
  jcc(Assembler::zero, L_post_third_loop_done);

  Label L_check_1;
  subl(idx, 2);
  jcc(Assembler::negative, L_check_1);

  movq(yz_idx1, Address(y, idx, Address::times_4, 0));
  rorxq(yz_idx1, yz_idx1, 32);
  mulxq(tmp4, tmp3, yz_idx1); //  yz_idx1 * rdx -> tmp4:tmp3
  movq(yz_idx2, Address(z, idx, Address::times_4, 0));
  rorxq(yz_idx2, yz_idx2, 32);

  add2_with_carry(tmp4, tmp3, carry, yz_idx2);

  movl(Address(z, idx, Address::times_4,  4), tmp3);
  shrq(tmp3, 32);
  movl(Address(z, idx, Address::times_4,  0), tmp3);
  movq(carry, tmp4);

  bind (L_check_1);
  addl (idx, 0x2);
  andl (idx, 0x1);
  subl(idx, 1);
  jcc(Assembler::negative, L_post_third_loop_done);
  // One trailing 32-bit word remains.
  movl(tmp4, Address(y, idx, Address::times_4, 0));
  mulxq(carry2, tmp3, tmp4);  //  tmp4 * rdx -> carry2:tmp3
  movl(tmp4, Address(z, idx, Address::times_4, 0));

  add2_with_carry(carry2, tmp3, tmp4, carry);

  movl(Address(z, idx, Address::times_4, 0), tmp3);
  shrq(tmp3, 32);

  shlq(carry2, 32);
  orq(tmp3, carry2);
  movq(carry, tmp3);

  bind(L_post_third_loop_done);
}

/**
 * Code for BigInteger::multiplyToLen() intrinsic.
7593 * 7594 * rdi: x 7595 * rax: xlen 7596 * rsi: y 7597 * rcx: ylen 7598 * r8: z 7599 * r11: zlen 7600 * r12: tmp1 7601 * r13: tmp2 7602 * r14: tmp3 7603 * r15: tmp4 7604 * rbx: tmp5 7605 * 7606 */ 7607 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, Register z, Register zlen, 7608 Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { 7609 ShortBranchVerifier sbv(this); 7610 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, rdx); 7611 7612 push(tmp1); 7613 push(tmp2); 7614 push(tmp3); 7615 push(tmp4); 7616 push(tmp5); 7617 7618 push(xlen); 7619 push(zlen); 7620 7621 const Register idx = tmp1; 7622 const Register kdx = tmp2; 7623 const Register xstart = tmp3; 7624 7625 const Register y_idx = tmp4; 7626 const Register carry = tmp5; 7627 const Register product = xlen; 7628 const Register x_xstart = zlen; // reuse register 7629 7630 // First Loop. 7631 // 7632 // final static long LONG_MASK = 0xffffffffL; 7633 // int xstart = xlen - 1; 7634 // int ystart = ylen - 1; 7635 // long carry = 0; 7636 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx-, kdx--) { 7637 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 7638 // z[kdx] = (int)product; 7639 // carry = product >>> 32; 7640 // } 7641 // z[xstart] = (int)carry; 7642 // 7643 7644 movl(idx, ylen); // idx = ylen; 7645 movl(kdx, zlen); // kdx = xlen+ylen; 7646 xorq(carry, carry); // carry = 0; 7647 7648 Label L_done; 7649 7650 movl(xstart, xlen); 7651 decrementl(xstart); 7652 jcc(Assembler::negative, L_done); 7653 7654 multiply_64_x_64_loop(x, xstart, x_xstart, y, y_idx, z, carry, product, idx, kdx); 7655 7656 Label L_second_loop; 7657 testl(kdx, kdx); 7658 jcc(Assembler::zero, L_second_loop); 7659 7660 Label L_carry; 7661 subl(kdx, 1); 7662 jcc(Assembler::zero, L_carry); 7663 7664 movl(Address(z, kdx, Address::times_4, 0), carry); 7665 shrq(carry, 32); 7666 subl(kdx, 1); 7667 7668 
bind(L_carry); 7669 movl(Address(z, kdx, Address::times_4, 0), carry); 7670 7671 // Second and third (nested) loops. 7672 // 7673 // for (int i = xstart-1; i >= 0; i--) { // Second loop 7674 // carry = 0; 7675 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 7676 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 7677 // (z[k] & LONG_MASK) + carry; 7678 // z[k] = (int)product; 7679 // carry = product >>> 32; 7680 // } 7681 // z[i] = (int)carry; 7682 // } 7683 // 7684 // i = xlen, j = tmp1, k = tmp2, carry = tmp5, x[i] = rdx 7685 7686 const Register jdx = tmp1; 7687 7688 bind(L_second_loop); 7689 xorl(carry, carry); // carry = 0; 7690 movl(jdx, ylen); // j = ystart+1 7691 7692 subl(xstart, 1); // i = xstart-1; 7693 jcc(Assembler::negative, L_done); 7694 7695 push (z); 7696 7697 Label L_last_x; 7698 lea(z, Address(z, xstart, Address::times_4, 4)); // z = z + k - j 7699 subl(xstart, 1); // i = xstart-1; 7700 jcc(Assembler::negative, L_last_x); 7701 7702 if (UseBMI2Instructions) { 7703 movq(rdx, Address(x, xstart, Address::times_4, 0)); 7704 rorxq(rdx, rdx, 32); // convert big-endian to little-endian 7705 } else { 7706 movq(x_xstart, Address(x, xstart, Address::times_4, 0)); 7707 rorq(x_xstart, 32); // convert big-endian to little-endian 7708 } 7709 7710 Label L_third_loop_prologue; 7711 bind(L_third_loop_prologue); 7712 7713 push (x); 7714 push (xstart); 7715 push (ylen); 7716 7717 7718 if (UseBMI2Instructions) { 7719 multiply_128_x_128_bmi2_loop(y, z, carry, x, jdx, ylen, product, tmp2, x_xstart, tmp3, tmp4); 7720 } else { // !UseBMI2Instructions 7721 multiply_128_x_128_loop(x_xstart, y, z, y_idx, jdx, ylen, carry, product, x); 7722 } 7723 7724 pop(ylen); 7725 pop(xlen); 7726 pop(x); 7727 pop(z); 7728 7729 movl(tmp3, xlen); 7730 addl(tmp3, 1); 7731 movl(Address(z, tmp3, Address::times_4, 0), carry); 7732 subl(tmp3, 1); 7733 jccb(Assembler::negative, L_done); 7734 7735 shrq(carry, 32); 7736 movl(Address(z, tmp3, Address::times_4, 
0), carry); 7737 jmp(L_second_loop); 7738 7739 // Next infrequent code is moved outside loops. 7740 bind(L_last_x); 7741 if (UseBMI2Instructions) { 7742 movl(rdx, Address(x, 0)); 7743 } else { 7744 movl(x_xstart, Address(x, 0)); 7745 } 7746 jmp(L_third_loop_prologue); 7747 7748 bind(L_done); 7749 7750 pop(zlen); 7751 pop(xlen); 7752 7753 pop(tmp5); 7754 pop(tmp4); 7755 pop(tmp3); 7756 pop(tmp2); 7757 pop(tmp1); 7758 } 7759 7760 //Helper functions for square_to_len() 7761 7762 /** 7763 * Store the squares of x[], right shifted one bit (divided by 2) into z[] 7764 * Preserves x and z and modifies rest of the registers. 7765 */ 7766 7767 void MacroAssembler::square_rshift(Register x, Register xlen, Register z, Register tmp1, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7768 // Perform square and right shift by 1 7769 // Handle odd xlen case first, then for even xlen do the following 7770 // jlong carry = 0; 7771 // for (int j=0, i=0; j < xlen; j+=2, i+=4) { 7772 // huge_128 product = x[j:j+1] * x[j:j+1]; 7773 // z[i:i+1] = (carry << 63) | (jlong)(product >>> 65); 7774 // z[i+2:i+3] = (jlong)(product >>> 1); 7775 // carry = (jlong)product; 7776 // } 7777 7778 xorq(tmp5, tmp5); // carry 7779 xorq(rdxReg, rdxReg); 7780 xorl(tmp1, tmp1); // index for x 7781 xorl(tmp4, tmp4); // index for z 7782 7783 Label L_first_loop, L_first_loop_exit; 7784 7785 testl(xlen, 1); 7786 jccb(Assembler::zero, L_first_loop); //jump if xlen is even 7787 7788 // Square and right shift by 1 the odd element using 32 bit multiply 7789 movl(raxReg, Address(x, tmp1, Address::times_4, 0)); 7790 imulq(raxReg, raxReg); 7791 shrq(raxReg, 1); 7792 adcq(tmp5, 0); 7793 movq(Address(z, tmp4, Address::times_4, 0), raxReg); 7794 incrementl(tmp1); 7795 addl(tmp4, 2); 7796 7797 // Square and right shift by 1 the rest using 64 bit multiply 7798 bind(L_first_loop); 7799 cmpptr(tmp1, xlen); 7800 jccb(Assembler::equal, L_first_loop_exit); 7801 7802 // Square 7803 movq(raxReg, 
Address(x, tmp1, Address::times_4, 0)); 7804 rorq(raxReg, 32); // convert big-endian to little-endian 7805 mulq(raxReg); // 64-bit multiply rax * rax -> rdx:rax 7806 7807 // Right shift by 1 and save carry 7808 shrq(tmp5, 1); // rdx:rax:tmp5 = (tmp5:rdx:rax) >>> 1 7809 rcrq(rdxReg, 1); 7810 rcrq(raxReg, 1); 7811 adcq(tmp5, 0); 7812 7813 // Store result in z 7814 movq(Address(z, tmp4, Address::times_4, 0), rdxReg); 7815 movq(Address(z, tmp4, Address::times_4, 8), raxReg); 7816 7817 // Update indices for x and z 7818 addl(tmp1, 2); 7819 addl(tmp4, 4); 7820 jmp(L_first_loop); 7821 7822 bind(L_first_loop_exit); 7823 } 7824 7825 7826 /** 7827 * Perform the following multiply add operation using BMI2 instructions 7828 * carry:sum = sum + op1*op2 + carry 7829 * op2 should be in rdx 7830 * op2 is preserved, all other registers are modified 7831 */ 7832 void MacroAssembler::multiply_add_64_bmi2(Register sum, Register op1, Register op2, Register carry, Register tmp2) { 7833 // assert op2 is rdx 7834 mulxq(tmp2, op1, op1); // op1 * op2 -> tmp2:op1 7835 addq(sum, carry); 7836 adcq(tmp2, 0); 7837 addq(sum, op1); 7838 adcq(tmp2, 0); 7839 movq(carry, tmp2); 7840 } 7841 7842 /** 7843 * Perform the following multiply add operation: 7844 * carry:sum = sum + op1*op2 + carry 7845 * Preserves op1, op2 and modifies rest of registers 7846 */ 7847 void MacroAssembler::multiply_add_64(Register sum, Register op1, Register op2, Register carry, Register rdxReg, Register raxReg) { 7848 // rdx:rax = op1 * op2 7849 movq(raxReg, op2); 7850 mulq(op1); 7851 7852 // rdx:rax = sum + carry + rdx:rax 7853 addq(sum, carry); 7854 adcq(rdxReg, 0); 7855 addq(sum, raxReg); 7856 adcq(rdxReg, 0); 7857 7858 // carry:sum = rdx:sum 7859 movq(carry, rdxReg); 7860 } 7861 7862 /** 7863 * Add 64 bit long carry into z[] with carry propogation. 7864 * Preserves z and carry register values and modifies rest of registers. 
7865 * 7866 */ 7867 void MacroAssembler::add_one_64(Register z, Register zlen, Register carry, Register tmp1) { 7868 Label L_fourth_loop, L_fourth_loop_exit; 7869 7870 movl(tmp1, 1); 7871 subl(zlen, 2); 7872 addq(Address(z, zlen, Address::times_4, 0), carry); 7873 7874 bind(L_fourth_loop); 7875 jccb(Assembler::carryClear, L_fourth_loop_exit); 7876 subl(zlen, 2); 7877 jccb(Assembler::negative, L_fourth_loop_exit); 7878 addq(Address(z, zlen, Address::times_4, 0), tmp1); 7879 jmp(L_fourth_loop); 7880 bind(L_fourth_loop_exit); 7881 } 7882 7883 /** 7884 * Shift z[] left by 1 bit. 7885 * Preserves x, len, z and zlen registers and modifies rest of the registers. 7886 * 7887 */ 7888 void MacroAssembler::lshift_by_1(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4) { 7889 7890 Label L_fifth_loop, L_fifth_loop_exit; 7891 7892 // Fifth loop 7893 // Perform primitiveLeftShift(z, zlen, 1) 7894 7895 const Register prev_carry = tmp1; 7896 const Register new_carry = tmp4; 7897 const Register value = tmp2; 7898 const Register zidx = tmp3; 7899 7900 // int zidx, carry; 7901 // long value; 7902 // carry = 0; 7903 // for (zidx = zlen-2; zidx >=0; zidx -= 2) { 7904 // (carry:value) = (z[i] << 1) | carry ; 7905 // z[i] = value; 7906 // } 7907 7908 movl(zidx, zlen); 7909 xorl(prev_carry, prev_carry); // clear carry flag and prev_carry register 7910 7911 bind(L_fifth_loop); 7912 decl(zidx); // Use decl to preserve carry flag 7913 decl(zidx); 7914 jccb(Assembler::negative, L_fifth_loop_exit); 7915 7916 if (UseBMI2Instructions) { 7917 movq(value, Address(z, zidx, Address::times_4, 0)); 7918 rclq(value, 1); 7919 rorxq(value, value, 32); 7920 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7921 } 7922 else { 7923 // clear new_carry 7924 xorl(new_carry, new_carry); 7925 7926 // Shift z[i] by 1, or in previous carry and save new carry 7927 movq(value, Address(z, zidx, Address::times_4, 0)); 7928 
shlq(value, 1); 7929 adcl(new_carry, 0); 7930 7931 orq(value, prev_carry); 7932 rorq(value, 0x20); 7933 movq(Address(z, zidx, Address::times_4, 0), value); // Store back in big endian form 7934 7935 // Set previous carry = new carry 7936 movl(prev_carry, new_carry); 7937 } 7938 jmp(L_fifth_loop); 7939 7940 bind(L_fifth_loop_exit); 7941 } 7942 7943 7944 /** 7945 * Code for BigInteger::squareToLen() intrinsic 7946 * 7947 * rdi: x 7948 * rsi: len 7949 * r8: z 7950 * rcx: zlen 7951 * r12: tmp1 7952 * r13: tmp2 7953 * r14: tmp3 7954 * r15: tmp4 7955 * rbx: tmp5 7956 * 7957 */ 7958 void MacroAssembler::square_to_len(Register x, Register len, Register z, Register zlen, Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 7959 7960 Label L_second_loop, L_second_loop_exit, L_third_loop, L_third_loop_exit, fifth_loop, fifth_loop_exit, L_last_x, L_multiply; 7961 push(tmp1); 7962 push(tmp2); 7963 push(tmp3); 7964 push(tmp4); 7965 push(tmp5); 7966 7967 // First loop 7968 // Store the squares, right shifted one bit (i.e., divided by 2). 7969 square_rshift(x, len, z, tmp1, tmp3, tmp4, tmp5, rdxReg, raxReg); 7970 7971 // Add in off-diagonal sums. 7972 // 7973 // Second, third (nested) and fourth loops. 
7974 // zlen +=2; 7975 // for (int xidx=len-2,zidx=zlen-4; xidx > 0; xidx-=2,zidx-=4) { 7976 // carry = 0; 7977 // long op2 = x[xidx:xidx+1]; 7978 // for (int j=xidx-2,k=zidx; j >= 0; j-=2) { 7979 // k -= 2; 7980 // long op1 = x[j:j+1]; 7981 // long sum = z[k:k+1]; 7982 // carry:sum = multiply_add_64(sum, op1, op2, carry, tmp_regs); 7983 // z[k:k+1] = sum; 7984 // } 7985 // add_one_64(z, k, carry, tmp_regs); 7986 // } 7987 7988 const Register carry = tmp5; 7989 const Register sum = tmp3; 7990 const Register op1 = tmp4; 7991 Register op2 = tmp2; 7992 7993 push(zlen); 7994 push(len); 7995 addl(zlen,2); 7996 bind(L_second_loop); 7997 xorq(carry, carry); 7998 subl(zlen, 4); 7999 subl(len, 2); 8000 push(zlen); 8001 push(len); 8002 cmpl(len, 0); 8003 jccb(Assembler::lessEqual, L_second_loop_exit); 8004 8005 // Multiply an array by one 64 bit long. 8006 if (UseBMI2Instructions) { 8007 op2 = rdxReg; 8008 movq(op2, Address(x, len, Address::times_4, 0)); 8009 rorxq(op2, op2, 32); 8010 } 8011 else { 8012 movq(op2, Address(x, len, Address::times_4, 0)); 8013 rorq(op2, 32); 8014 } 8015 8016 bind(L_third_loop); 8017 decrementl(len); 8018 jccb(Assembler::negative, L_third_loop_exit); 8019 decrementl(len); 8020 jccb(Assembler::negative, L_last_x); 8021 8022 movq(op1, Address(x, len, Address::times_4, 0)); 8023 rorq(op1, 32); 8024 8025 bind(L_multiply); 8026 subl(zlen, 2); 8027 movq(sum, Address(z, zlen, Address::times_4, 0)); 8028 8029 // Multiply 64 bit by 64 bit and add 64 bits lower half and upper 64 bits as carry. 8030 if (UseBMI2Instructions) { 8031 multiply_add_64_bmi2(sum, op1, op2, carry, tmp2); 8032 } 8033 else { 8034 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8035 } 8036 8037 movq(Address(z, zlen, Address::times_4, 0), sum); 8038 8039 jmp(L_third_loop); 8040 bind(L_third_loop_exit); 8041 8042 // Fourth loop 8043 // Add 64 bit long carry into z with carry propogation. 8044 // Uses offsetted zlen. 
8045 add_one_64(z, zlen, carry, tmp1); 8046 8047 pop(len); 8048 pop(zlen); 8049 jmp(L_second_loop); 8050 8051 // Next infrequent code is moved outside loops. 8052 bind(L_last_x); 8053 movl(op1, Address(x, 0)); 8054 jmp(L_multiply); 8055 8056 bind(L_second_loop_exit); 8057 pop(len); 8058 pop(zlen); 8059 pop(len); 8060 pop(zlen); 8061 8062 // Fifth loop 8063 // Shift z left 1 bit. 8064 lshift_by_1(x, len, z, zlen, tmp1, tmp2, tmp3, tmp4); 8065 8066 // z[zlen-1] |= x[len-1] & 1; 8067 movl(tmp3, Address(x, len, Address::times_4, -4)); 8068 andl(tmp3, 1); 8069 orl(Address(z, zlen, Address::times_4, -4), tmp3); 8070 8071 pop(tmp5); 8072 pop(tmp4); 8073 pop(tmp3); 8074 pop(tmp2); 8075 pop(tmp1); 8076 } 8077 8078 /** 8079 * Helper function for mul_add() 8080 * Multiply the in[] by int k and add to out[] starting at offset offs using 8081 * 128 bit by 32 bit multiply and return the carry in tmp5. 8082 * Only quad int aligned length of in[] is operated on in this function. 8083 * k is in rdxReg for BMI2Instructions, for others it is in tmp2. 8084 * This function preserves out, in and k registers. 8085 * len and offset point to the appropriate index in "in" & "out" correspondingly 8086 * tmp5 has the carry. 8087 * other registers are temporary and are modified. 
8088 * 8089 */ 8090 void MacroAssembler::mul_add_128_x_32_loop(Register out, Register in, 8091 Register offset, Register len, Register tmp1, Register tmp2, Register tmp3, 8092 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8093 8094 Label L_first_loop, L_first_loop_exit; 8095 8096 movl(tmp1, len); 8097 shrl(tmp1, 2); 8098 8099 bind(L_first_loop); 8100 subl(tmp1, 1); 8101 jccb(Assembler::negative, L_first_loop_exit); 8102 8103 subl(len, 4); 8104 subl(offset, 4); 8105 8106 Register op2 = tmp2; 8107 const Register sum = tmp3; 8108 const Register op1 = tmp4; 8109 const Register carry = tmp5; 8110 8111 if (UseBMI2Instructions) { 8112 op2 = rdxReg; 8113 } 8114 8115 movq(op1, Address(in, len, Address::times_4, 8)); 8116 rorq(op1, 32); 8117 movq(sum, Address(out, offset, Address::times_4, 8)); 8118 rorq(sum, 32); 8119 if (UseBMI2Instructions) { 8120 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8121 } 8122 else { 8123 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8124 } 8125 // Store back in big endian from little endian 8126 rorq(sum, 0x20); 8127 movq(Address(out, offset, Address::times_4, 8), sum); 8128 8129 movq(op1, Address(in, len, Address::times_4, 0)); 8130 rorq(op1, 32); 8131 movq(sum, Address(out, offset, Address::times_4, 0)); 8132 rorq(sum, 32); 8133 if (UseBMI2Instructions) { 8134 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8135 } 8136 else { 8137 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8138 } 8139 // Store back in big endian from little endian 8140 rorq(sum, 0x20); 8141 movq(Address(out, offset, Address::times_4, 0), sum); 8142 8143 jmp(L_first_loop); 8144 bind(L_first_loop_exit); 8145 } 8146 8147 /** 8148 * Code for BigInteger::mulAdd() intrinsic 8149 * 8150 * rdi: out 8151 * rsi: in 8152 * r11: offs (out.length - offset) 8153 * rcx: len 8154 * r8: k 8155 * r12: tmp1 8156 * r13: tmp2 8157 * r14: tmp3 8158 * r15: tmp4 8159 * rbx: tmp5 8160 * Multiply the in[] by word k and add to out[], return the carry 
in rax 8161 */ 8162 void MacroAssembler::mul_add(Register out, Register in, Register offs, 8163 Register len, Register k, Register tmp1, Register tmp2, Register tmp3, 8164 Register tmp4, Register tmp5, Register rdxReg, Register raxReg) { 8165 8166 Label L_carry, L_last_in, L_done; 8167 8168 // carry = 0; 8169 // for (int j=len-1; j >= 0; j--) { 8170 // long product = (in[j] & LONG_MASK) * kLong + 8171 // (out[offs] & LONG_MASK) + carry; 8172 // out[offs--] = (int)product; 8173 // carry = product >>> 32; 8174 // } 8175 // 8176 push(tmp1); 8177 push(tmp2); 8178 push(tmp3); 8179 push(tmp4); 8180 push(tmp5); 8181 8182 Register op2 = tmp2; 8183 const Register sum = tmp3; 8184 const Register op1 = tmp4; 8185 const Register carry = tmp5; 8186 8187 if (UseBMI2Instructions) { 8188 op2 = rdxReg; 8189 movl(op2, k); 8190 } 8191 else { 8192 movl(op2, k); 8193 } 8194 8195 xorq(carry, carry); 8196 8197 //First loop 8198 8199 //Multiply in[] by k in a 4 way unrolled loop using 128 bit by 32 bit multiply 8200 //The carry is in tmp5 8201 mul_add_128_x_32_loop(out, in, offs, len, tmp1, tmp2, tmp3, tmp4, tmp5, rdxReg, raxReg); 8202 8203 //Multiply the trailing in[] entry using 64 bit by 32 bit, if any 8204 decrementl(len); 8205 jccb(Assembler::negative, L_carry); 8206 decrementl(len); 8207 jccb(Assembler::negative, L_last_in); 8208 8209 movq(op1, Address(in, len, Address::times_4, 0)); 8210 rorq(op1, 32); 8211 8212 subl(offs, 2); 8213 movq(sum, Address(out, offs, Address::times_4, 0)); 8214 rorq(sum, 32); 8215 8216 if (UseBMI2Instructions) { 8217 multiply_add_64_bmi2(sum, op1, op2, carry, raxReg); 8218 } 8219 else { 8220 multiply_add_64(sum, op1, op2, carry, rdxReg, raxReg); 8221 } 8222 8223 // Store back in big endian from little endian 8224 rorq(sum, 0x20); 8225 movq(Address(out, offs, Address::times_4, 0), sum); 8226 8227 testl(len, len); 8228 jccb(Assembler::zero, L_carry); 8229 8230 //Multiply the last in[] entry, if any 8231 bind(L_last_in); 8232 movl(op1, Address(in, 0)); 8233 
movl(sum, Address(out, offs, Address::times_4, -4)); 8234 8235 movl(raxReg, k); 8236 mull(op1); //tmp4 * eax -> edx:eax 8237 addl(sum, carry); 8238 adcl(rdxReg, 0); 8239 addl(sum, raxReg); 8240 adcl(rdxReg, 0); 8241 movl(carry, rdxReg); 8242 8243 movl(Address(out, offs, Address::times_4, -4), sum); 8244 8245 bind(L_carry); 8246 //return tmp5/carry as carry in rax 8247 movl(rax, carry); 8248 8249 bind(L_done); 8250 pop(tmp5); 8251 pop(tmp4); 8252 pop(tmp3); 8253 pop(tmp2); 8254 pop(tmp1); 8255 } 8256 #endif 8257 8258 /** 8259 * Emits code to update CRC-32 with a byte value according to constants in table 8260 * 8261 * @param [in,out]crc Register containing the crc. 8262 * @param [in]val Register containing the byte to fold into the CRC. 8263 * @param [in]table Register containing the table of crc constants. 8264 * 8265 * uint32_t crc; 8266 * val = crc_table[(val ^ crc) & 0xFF]; 8267 * crc = val ^ (crc >> 8); 8268 * 8269 */ 8270 void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { 8271 xorl(val, crc); 8272 andl(val, 0xFF); 8273 shrl(crc, 8); // unsigned shift 8274 xorl(crc, Address(table, val, Address::times_4, 0)); 8275 } 8276 8277 /** 8278 * Fold 128-bit data chunk 8279 */ 8280 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, Register buf, int offset) { 8281 if (UseAVX > 0) { 8282 vpclmulhdq(xtmp, xK, xcrc); // [123:64] 8283 vpclmulldq(xcrc, xK, xcrc); // [63:0] 8284 vpxor(xcrc, xcrc, Address(buf, offset), 0 /* vector_len */); 8285 pxor(xcrc, xtmp); 8286 } else { 8287 movdqa(xtmp, xcrc); 8288 pclmulhdq(xtmp, xK); // [123:64] 8289 pclmulldq(xcrc, xK); // [63:0] 8290 pxor(xcrc, xtmp); 8291 movdqu(xtmp, Address(buf, offset)); 8292 pxor(xcrc, xtmp); 8293 } 8294 } 8295 8296 void MacroAssembler::fold_128bit_crc32(XMMRegister xcrc, XMMRegister xK, XMMRegister xtmp, XMMRegister xbuf) { 8297 if (UseAVX > 0) { 8298 vpclmulhdq(xtmp, xK, xcrc); 8299 vpclmulldq(xcrc, xK, xcrc); 8300 pxor(xcrc, xbuf); 8301 
pxor(xcrc, xtmp); 8302 } else { 8303 movdqa(xtmp, xcrc); 8304 pclmulhdq(xtmp, xK); 8305 pclmulldq(xcrc, xK); 8306 pxor(xcrc, xbuf); 8307 pxor(xcrc, xtmp); 8308 } 8309 } 8310 8311 /** 8312 * 8-bit folds to compute 32-bit CRC 8313 * 8314 * uint64_t xcrc; 8315 * timesXtoThe32[xcrc & 0xFF] ^ (xcrc >> 8); 8316 */ 8317 void MacroAssembler::fold_8bit_crc32(XMMRegister xcrc, Register table, XMMRegister xtmp, Register tmp) { 8318 movdl(tmp, xcrc); 8319 andl(tmp, 0xFF); 8320 movdl(xtmp, Address(table, tmp, Address::times_4, 0)); 8321 psrldq(xcrc, 1); // unsigned shift one byte 8322 pxor(xcrc, xtmp); 8323 } 8324 8325 /** 8326 * uint32_t crc; 8327 * timesXtoThe32[crc & 0xFF] ^ (crc >> 8); 8328 */ 8329 void MacroAssembler::fold_8bit_crc32(Register crc, Register table, Register tmp) { 8330 movl(tmp, crc); 8331 andl(tmp, 0xFF); 8332 shrl(crc, 8); 8333 xorl(crc, Address(table, tmp, Address::times_4, 0)); 8334 } 8335 8336 /** 8337 * @param crc register containing existing CRC (32-bit) 8338 * @param buf register pointing to input byte buffer (byte*) 8339 * @param len register containing number of bytes 8340 * @param table register that will contain address of CRC table 8341 * @param tmp scratch register 8342 */ 8343 void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, Register table, Register tmp) { 8344 assert_different_registers(crc, buf, len, table, tmp, rax); 8345 8346 Label L_tail, L_tail_restore, L_tail_loop, L_exit, L_align_loop, L_aligned; 8347 Label L_fold_tail, L_fold_128b, L_fold_512b, L_fold_512b_loop, L_fold_tail_loop; 8348 8349 lea(table, ExternalAddress(StubRoutines::crc_table_addr())); 8350 notl(crc); // ~crc 8351 cmpl(len, 16); 8352 jcc(Assembler::less, L_tail); 8353 8354 // Align buffer to 16 bytes 8355 movl(tmp, buf); 8356 andl(tmp, 0xF); 8357 jccb(Assembler::zero, L_aligned); 8358 subl(tmp, 16); 8359 addl(len, tmp); 8360 8361 align(4); 8362 BIND(L_align_loop); 8363 movsbl(rax, Address(buf, 0)); // load byte with sign extension 8364 
update_byte_crc32(crc, rax, table); 8365 increment(buf); 8366 incrementl(tmp); 8367 jccb(Assembler::less, L_align_loop); 8368 8369 BIND(L_aligned); 8370 movl(tmp, len); // save 8371 shrl(len, 4); 8372 jcc(Assembler::zero, L_tail_restore); 8373 8374 // Fold crc into first bytes of vector 8375 movdqa(xmm1, Address(buf, 0)); 8376 movdl(rax, xmm1); 8377 xorl(crc, rax); 8378 pinsrd(xmm1, crc, 0); 8379 addptr(buf, 16); 8380 subl(len, 4); // len > 0 8381 jcc(Assembler::less, L_fold_tail); 8382 8383 movdqa(xmm2, Address(buf, 0)); 8384 movdqa(xmm3, Address(buf, 16)); 8385 movdqa(xmm4, Address(buf, 32)); 8386 addptr(buf, 48); 8387 subl(len, 3); 8388 jcc(Assembler::lessEqual, L_fold_512b); 8389 8390 // Fold total 512 bits of polynomial on each iteration, 8391 // 128 bits per each of 4 parallel streams. 8392 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 32)); 8393 8394 align(32); 8395 BIND(L_fold_512b_loop); 8396 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 8397 fold_128bit_crc32(xmm2, xmm0, xmm5, buf, 16); 8398 fold_128bit_crc32(xmm3, xmm0, xmm5, buf, 32); 8399 fold_128bit_crc32(xmm4, xmm0, xmm5, buf, 48); 8400 addptr(buf, 64); 8401 subl(len, 4); 8402 jcc(Assembler::greater, L_fold_512b_loop); 8403 8404 // Fold 512 bits to 128 bits. 8405 BIND(L_fold_512b); 8406 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); 8407 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm2); 8408 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm3); 8409 fold_128bit_crc32(xmm1, xmm0, xmm5, xmm4); 8410 8411 // Fold the rest of 128 bits data chunks 8412 BIND(L_fold_tail); 8413 addl(len, 3); 8414 jccb(Assembler::lessEqual, L_fold_128b); 8415 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr() + 16)); 8416 8417 BIND(L_fold_tail_loop); 8418 fold_128bit_crc32(xmm1, xmm0, xmm5, buf, 0); 8419 addptr(buf, 16); 8420 decrementl(len); 8421 jccb(Assembler::greater, L_fold_tail_loop); 8422 8423 // Fold 128 bits in xmm1 down into 32 bits in crc register. 
8424 BIND(L_fold_128b); 8425 movdqu(xmm0, ExternalAddress(StubRoutines::x86::crc_by128_masks_addr())); 8426 if (UseAVX > 0) { 8427 vpclmulqdq(xmm2, xmm0, xmm1, 0x1); 8428 vpand(xmm3, xmm0, xmm2, 0 /* vector_len */); 8429 vpclmulqdq(xmm0, xmm0, xmm3, 0x1); 8430 } else { 8431 movdqa(xmm2, xmm0); 8432 pclmulqdq(xmm2, xmm1, 0x1); 8433 movdqa(xmm3, xmm0); 8434 pand(xmm3, xmm2); 8435 pclmulqdq(xmm0, xmm3, 0x1); 8436 } 8437 psrldq(xmm1, 8); 8438 psrldq(xmm2, 4); 8439 pxor(xmm0, xmm1); 8440 pxor(xmm0, xmm2); 8441 8442 // 8 8-bit folds to compute 32-bit CRC. 8443 for (int j = 0; j < 4; j++) { 8444 fold_8bit_crc32(xmm0, table, xmm1, rax); 8445 } 8446 movdl(crc, xmm0); // mov 32 bits to general register 8447 for (int j = 0; j < 4; j++) { 8448 fold_8bit_crc32(crc, table, rax); 8449 } 8450 8451 BIND(L_tail_restore); 8452 movl(len, tmp); // restore 8453 BIND(L_tail); 8454 andl(len, 0xf); 8455 jccb(Assembler::zero, L_exit); 8456 8457 // Fold the rest of bytes 8458 align(4); 8459 BIND(L_tail_loop); 8460 movsbl(rax, Address(buf, 0)); // load byte with sign extension 8461 update_byte_crc32(crc, rax, table); 8462 increment(buf); 8463 decrementl(len); 8464 jccb(Assembler::greater, L_tail_loop); 8465 8466 BIND(L_exit); 8467 notl(crc); // ~c 8468 } 8469 8470 namespace CRC32C { 8471 #include "crc32c.h" 8472 8473 #define Nehalem(x) x 8474 #define Westmere(x) x 8475 8476 #undef IN 8477 #define IN(x) x 8478 #define INOUT(x) x 8479 #undef OUT 8480 #define OUT(x) x 8481 #define Scratch(x) x 8482 8483 #undef D 8484 8485 #ifdef _LP64 8486 // S. Gueron / Information Processing Letters 112 (2012) 184 8487 // Algorithm 4: Computing carry-less multiplication using a precomputed lookup table. 8488 // Input: A 32 bit value B = [byte3, byte2, byte1, byte0]. 
8489 // Output: the 64-bit carry-less product of B * CONST 8490 void IPL_Alg4(INOUT(Register B), uint32_t n, 8491 Scratch(Register C), Scratch(Register D), Scratch(Register Z), 8492 MacroAssembler * This) { 8493 This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr())); 8494 if (n > 0) { 8495 This->addq(Z, n * 256 * 8); 8496 } 8497 // Q1 = TABLEExt[n][B & 0xFF]; 8498 This->movl(C, B); 8499 This->andl(C, 0x000000FF); 8500 This->shll(C, 3); 8501 This->addq(C, Z); 8502 This->movq(C, Address(C, 0)); 8503 8504 // Q2 = TABLEExt[n][B >> 8 & 0xFF]; 8505 This->movl(D, B); 8506 This->shrl(D, 8); 8507 This->andl(D, 0x000000FF); 8508 This->shll(D, 3); 8509 This->addq(D, Z); 8510 This->movq(D, Address(D, 0)); 8511 8512 This->shlq(D, 8); 8513 This->xorq(C, D); 8514 8515 // Q3 = TABLEExt[n][B >> 16 & 0xFF]; 8516 This->movl(D, B); 8517 This->shrl(D, 16); 8518 This->andl(D, 0x000000FF); 8519 This->shll(D, 3); 8520 This->addq(D, Z); 8521 This->movq(D, Address(D, 0)); 8522 8523 This->shlq(D, 16); 8524 This->xorq(C, D); 8525 8526 // Q4 = TABLEExt[n][B >> 24 & 0xFF]; 8527 This->shrl(B, 24); 8528 This->andl(B, 0x000000FF); 8529 This->shll(B, 3); 8530 This->addq(B, Z); 8531 This->movq(B, Address(B, 0)); 8532 8533 This->shlq(B, 24); 8534 This->xorq(B, C); 8535 // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24; 8536 } 8537 8538 void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)), 8539 INOUT(Register crc), 8540 uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported, 8541 Westmere(Scratch(XMMRegister DXMM)), 8542 Scratch(Register A), 8543 Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)), 8544 MacroAssembler * This) { 8545 if (IsPclmulqdqSupported) { 8546 This->movdl(crcXMM, crc); // modified blindly 8547 8548 This->movl(A, CONSTOrPreCompConstIndex); 8549 This->movdl(DXMM, A); 8550 This->pclmulqdq(crcXMM, DXMM, 0); 8551 8552 This->movdq(crc, crcXMM); 8553 } else { 8554 IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, This); 8555 } 8556 } 8557 8558 // Recombination 
// Recombination, Alternative 2: No bit-reflections
// T1 = (CRC_A * U1) << 1
// T2 = (CRC_B * U2) << 1
// C1 = T1 >> 32
// C2 = T2 >> 32
// T1 = T1 & 0xFFFFFFFF
// T2 = T2 & 0xFFFFFFFF
// T1 = CRC32(0, T1)
// T2 = CRC32(0, T2)
// C1 = C1 ^ T1
// C2 = C2 ^ T2
// CRC = C1 ^ C2 ^ CRC_C
//
// 64-bit variant: emits code that folds the three partial CRC values
// (crcA, crcB, crcC) produced over chunks A, B, C into a single CRC,
// using carry-less multiplication (via PCLMULQDQ below) by the
// precomputed constants U1/U2, per the formula above.
// crcA is in-out (receives the combined result); crcB/crcC are consumed.
// E, F are GPR scratch; G is extra scratch for the non-PCLMULQDQ path.
void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC),
             Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
             Scratch(Register E), Scratch(Register F),
             Nehalem(Scratch(Register G)),
             MacroAssembler * This) {
  // T1 = CRC_A * U1, T2 = CRC_B * U2 (results land in the crcA/crcB GPRs
  // on this 64-bit path — see PCLMULQDQ).
  PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This);
  PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This);
  This->shlq(crcA, 1);                 // T1 = (CRC_A * U1) << 1
  This->movl(E, crcA);                 // E  = T1 & 0xFFFFFFFF
  This->shrq(crcA, 32);                // crcA = C1 = T1 >> 32
  This->xorl(F, F);
  This->crc32(F, E, 4);                // F = CRC32(0, T1)
  This->xorl(crcA, F); // we don't care about upper 32 bit contents here
  This->shlq(crcB, 1);                 // T2 = (CRC_B * U2) << 1
  This->movl(E, crcB);                 // E  = T2 & 0xFFFFFFFF
  This->shrq(crcB, 32);                // crcB = C2 = T2 >> 32
  This->xorl(F, F);
  This->crc32(F, E, 4);                // F = CRC32(0, T2)
  This->xorl(crcB, F);                 // C2 ^= CRC32(0, T2)
  This->xorl(crcA, crcB);              // C1 ^ C2
  This->xorl(crcA, crcC);              // CRC = C1 ^ C2 ^ CRC_C
}

// Set N to predefined value
// Subtract N from the length of the buffer
// execute in a loop:
// CRC_A = 0xFFFFFFFF, CRC_B = 0, CRC_C = 0
// for i = 1 to N do
//   CRC_A = CRC32(CRC_A, A[i])
//   CRC_B = CRC32(CRC_B, B[i])
//   CRC_C = CRC32(CRC_C, C[i])
// end for
// Recombine
//
// 64-bit variant: emits a loop that consumes 3*size bytes per iteration,
// treating them as three interleaved streams of 'size' bytes each and
// running three independent CRC32 chains in parallel (pipelined), then
// recombines via RecAlt2. len/buf/crc are updated in place.
void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported,
               INOUT(Register len), INOUT(Register buf), INOUT(Register crc),
               Scratch(Register E), Scratch(Register F), Scratch(Register end),
               Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
               Scratch(Register G), Scratch(Register H),
               Nehalem(Scratch(Register I)),
               MacroAssembler * This) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  This->bind(L_processPartitions);
  This->cmpl(len, 3 * size);                       // need a full 3-stream chunk
  This->jcc(Assembler::less, L_exit);
  This->xorl(E, E);                                // CRC_B = 0
  This->xorl(F, F);                                // CRC_C = 0
  This->movq(end, buf);
  This->addq(end, size);                           // end of stream A

  This->bind(L_processPartition);
  // One quadword from each of the three streams per iteration.
  This->crc32(crc, Address(buf, 0), 8);
  This->crc32(E, Address(buf, size), 8);
  This->crc32(F, Address(buf, size * 2), 8);
  This->addq(buf, 8);
  This->cmpq(buf, end);
  This->jcc(Assembler::less, L_processPartition);
  RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F,
          AXMM, BXMM, CXMM,
          G, H,
          I,
          This);
  This->addq(buf, 2 * size);                       // skip streams B and C
  This->subl(len, 3 * size);
  This->jmp(L_processPartitions);

  This->bind(L_exit);
}
#else
// 32-bit fallback for carry-less multiplication when PCLMULQDQ is not
// available: computes B = B * TABLEExt[n] (a 32x32 -> 64 bit carry-less
// product) by combining four byte-indexed 64-bit table entries, shifted
// by 0/8/16/24 bits. Result is left in CXMM; B is clobbered.
// Z points at the table; C, D are GPR scratch, DXMM is XMM scratch.
void IPL_Alg4(INOUT(Register B), uint32_t n,
              Scratch(Register C), Scratch(Register D), Scratch(Register Z),
              Scratch(XMMRegister CXMM), Scratch(XMMRegister DXMM),
              MacroAssembler * This) {
  This->lea(Z, ExternalAddress(StubRoutines::crc32c_table_addr()));
  if (n > 0) {
    This->addl(Z, n * 256 * 8);   // select sub-table n (256 entries of 8 bytes)
  }
  // Q1 = TABLEExt[n][B & 0xFF];
  This->movl(C, B);
  This->andl(C, 0x000000FF);
  This->shll(C, 3);
  This->addl(C, Z);
  This->movq(CXMM, Address(C, 0));

  // Q2 = TABLEExt[n][B >> 8 & 0xFF];
  This->movl(D, B);
  This->shrl(D, 8);
  This->andl(D, 0x000000FF);
  This->shll(D, 3);
  This->addl(D, Z);
  This->movq(DXMM, Address(D, 0));

  This->psllq(DXMM, 8);
  This->pxor(CXMM, DXMM);

  // Q3 = TABLEExt[n][B >> 16 & 0xFF];
  This->movl(D, B);
  This->shrl(D, 16);
  This->andl(D, 0x000000FF);
  This->shll(D, 3);
  This->addl(D, Z);
  This->movq(DXMM, Address(D, 0));

  This->psllq(DXMM, 16);
  This->pxor(CXMM, DXMM);

  // Q4 = TABLEExt[n][B >> 24 & 0xFF];
  This->shrl(B, 24);
  This->andl(B, 0x000000FF);
  This->shll(B, 3);
  This->addl(B, Z);
  This->movq(DXMM, Address(B, 0));

  This->psllq(DXMM, 24);
  This->pxor(CXMM, DXMM); // Result in CXMM
  // return Q1 ^ Q2 << 8 ^ Q3 << 16 ^ Q4 << 24;
}

// 32-bit variant of the carry-less multiply helper: when PCLMULQDQ is
// supported, multiplies crc by the constant via pclmulqdq and leaves the
// 64-bit product in crcXMM (a 32-bit GPR cannot hold it); otherwise falls
// back to the table-driven IPL_Alg4, where CONSTOrPreCompConstIndex is the
// sub-table index rather than a constant value.
void PCLMULQDQ(Westmere(Scratch(XMMRegister crcXMM)),
               INOUT(Register crc),
               uint32_t CONSTOrPreCompConstIndex, bool IsPclmulqdqSupported,
               Westmere(Scratch(XMMRegister DXMM)),
               Scratch(Register A),
               Nehalem(Scratch(Register B)), Nehalem(Scratch(Register C)),
               MacroAssembler * This) {
  if (IsPclmulqdqSupported) {
    This->movdl(crcXMM, crc);

    This->movl(A, CONSTOrPreCompConstIndex);
    This->movdl(DXMM, A);
    This->pclmulqdq(crcXMM, DXMM, 0);
    // Keep result in XMM since GPR is 32 bit in length
  } else {
    IPL_Alg4(crc, CONSTOrPreCompConstIndex, A, B, C, crcXMM, DXMM, This);
  }
}

// 32-bit variant of RecAlt2 (same recombination formula as the 64-bit
// version above): the 64-bit products live in AXMM/BXMM, so the shift,
// split into high/low halves, and extraction are done with XMM ops
// (psllq/psrlq/movdl) instead of 64-bit GPR shifts.
void RecAlt2(uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported, INOUT(Register crcA), IN(Scratch(Register crcB)), IN(Register crcC),
             Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
             Scratch(Register E), Scratch(Register F),
             Nehalem(Scratch(Register G)),
             MacroAssembler * This) {
  PCLMULQDQ(AXMM, crcA, CONSTOrPreCompConstIndexU1, IsPclmulqdqSupported, CXMM, E, F, G, This);
  PCLMULQDQ(BXMM, crcB, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, CXMM, E, F, G, This);

  This->psllq(AXMM, 1);          // T1 = (CRC_A * U1) << 1
  This->movdl(E, AXMM);          // E  = T1 & 0xFFFFFFFF
  This->psrlq(AXMM, 32);
  This->movdl(crcA, AXMM);       // crcA = C1 = T1 >> 32

  This->xorl(F, F);
  This->crc32(F, E, 4);          // F = CRC32(0, T1)
  This->xorl(crcA, F);

  This->psllq(BXMM, 1);          // T2 = (CRC_B * U2) << 1
  This->movdl(E, BXMM);          // E  = T2 & 0xFFFFFFFF
  This->psrlq(BXMM, 32);
  This->movdl(crcB, BXMM);       // crcB = C2 = T2 >> 32

  This->xorl(F, F);
  This->crc32(F, E, 4);          // F = CRC32(0, T2)
  This->xorl(crcB, F);
  This->xorl(crcA, crcB);        // C1 ^ C2
  This->xorl(crcA, crcC);        // CRC = C1 ^ C2 ^ CRC_C
}

// 32-bit variant of ProcChunk: same three-stream pipelined loop as the
// 64-bit version, but each quadword is consumed as two 4-byte crc32 steps
// and address arithmetic is 32-bit. Because the 32-bit register file is
// small, end/len/buf are spilled to the stack around RecAlt2 and their
// registers are lent to it as the G/H/I scratch registers.
void ProcChunk(uint32_t size, uint32_t CONSTOrPreCompConstIndexU1, uint32_t CONSTOrPreCompConstIndexU2, bool IsPclmulqdqSupported,
               INOUT(Register len), INOUT(Register buf), INOUT(Register crc),
               Scratch(Register E), Scratch(Register F), Scratch(Register end),
               Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
               Scratch(Register G), Scratch(Register H),
               Nehalem(Scratch(Register I)),
               MacroAssembler * This) {
  Label L_processPartitions;
  Label L_processPartition;
  Label L_exit;

  This->bind(L_processPartitions);
  This->cmpl(len, 3 * size);
  This->jcc(Assembler::less, L_exit);
  This->xorl(E, E);
  This->xorl(F, F);
  This->movl(end, buf);
  This->addl(end, size);

  This->bind(L_processPartition);
  // 8 bytes per stream per iteration, as two 4-byte crc32 steps each.
  This->crc32(crc, Address(buf, 0), 4);
  This->crc32(E, Address(buf, size), 4);
  This->crc32(F, Address(buf, size*2), 4);
  This->crc32(crc, Address(buf, 0+4), 4);
  This->crc32(E, Address(buf, size+4), 4);
  This->crc32(F, Address(buf, size*2+4), 4);
  This->addl(buf, 8);
  This->cmpl(buf, end);
  This->jcc(Assembler::less, L_processPartition);

  // Spill end/len/buf so their registers can serve as RecAlt2 scratch.
  This->push(end);
  This->push(len);
  This->push(buf);
  G = end;
  H = len;
  I = buf;

  RecAlt2(CONSTOrPreCompConstIndexU1, CONSTOrPreCompConstIndexU2, IsPclmulqdqSupported, crc, E, F,
          AXMM, BXMM, CXMM,
          G, H,
          I,
          This);

  // Restore in reverse push order.
  This->pop(buf);
  This->pop(len);
  This->pop(end);

  This->addl(buf, 2 * size);
  This->subl(len, 3 * size);
  This->jmp(L_processPartitions);

  This->bind(L_exit);
}
#endif //LP64
} // closes the enclosing scope — presumably namespace CRC32C; its opening brace is outside this chunk
#undef D

#ifdef _LP64
// Algorithm 2:
// (Algorithm 2 continued) Pipelined usage of the CRC32 instruction.
// Input: A buffer I of L bytes.
// Output: the CRC32C value of the buffer.
// Notations:
// Write L = 24N + r, with N = floor (L/24).
// r = L mod 24 (0 <= r < 24).
// Consider I as the concatenation of A|B|C|R, where A, B, C, each,
// N quadwords, and R consists of r bytes.
// A[j] = I [8j+7:8j], j= 0, 1, ..., N-1
// B[j] = I [N + 8j+7:N + 8j], j= 0, 1, ..., N-1
// C[j] = I [2N + 8j+7:2N + 8j], j= 0, 1, ..., N-1
// if r > 0 R[j] = I [3N +j], j= 0, 1, ...,r-1
//
// 64-bit entry point: loads the three pairs of recombination constants
// (or table indices when PCLMULQDQ is unavailable), runs ProcChunk over
// HIGH/MIDDLE/LOW chunk sizes, then finishes the sub-8-byte remainder
// word-by-word and byte-by-byte. crc/buf/len are updated in place;
// A-F and AXMM-CXMM are scratch.
void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len,
                                             Scratch(Register A), Scratch(Register B), Scratch(Register C),
                                             Scratch(Register D), Scratch(Register E), Scratch(Register F),
                                             Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
                                             bool IsPclmulqdqSupported) {
  uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (IsPclmulqdqSupported ) {
    // Precomputed multiplication constants, read pairwise from the stub
    // table; note each pair is stored swapped relative to its use order.
    CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr+1);

    CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
    assert((CRC32C::NUM_PRECOMPUTED_CONSTANTS - 1 ) == 5, "Checking whether you declared all of the constants based on the number of \"chunks\"");
  } else {
    // No PCLMULQDQ: these are sub-table indices for the IPL_Alg4 fallback.
    CONSTOrPreCompConstIndex[0] = 1;
    CONSTOrPreCompConstIndex[1] = 0;

    CONSTOrPreCompConstIndex[2] = 3;
    CONSTOrPreCompConstIndex[3] = 2;

    CONSTOrPreCompConstIndex[4] = 5;
    CONSTOrPreCompConstIndex[5] = 4;
  }
  // Successively smaller chunk sizes so the tail left by one pass is
  // consumed by the next.
  CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported,
                    len, buf, crc,
                    A, B, C,
                    AXMM, BXMM, CXMM,
                    D, E,
                    F,
                    this);
  CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported,
                    len, buf, crc,
                    A, B, C,
                    AXMM, BXMM, CXMM,
                    D, E,
                    F,
                    this);
  CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported,
                    len, buf, crc,
                    A, B, C,
                    AXMM, BXMM, CXMM,
                    D, E,
                    F,
                    this);
  // A = buf + (len rounded down to a multiple of 8): end of whole words.
  movl(A, len);
  andl(A, 0x00000007);
  negl(A);
  addl(A, len);
  addq(A, buf);

  BIND(L_wordByWord);
  cmpq(buf, A);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
  crc32(crc, Address(buf, 0), 4);
  addq(buf, 4);
  jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(len, 0x00000007);   // remaining byte count (0..7)
  movl(B, 1);              // 1-based byte counter

  BIND(L_byteByByte);
  cmpl(B, len);
  jccb(Assembler::greater, L_exit);
  crc32(crc, Address(buf, 0), 1);
  incq(buf);
  incl(B);
  jmp(L_byteByByte);

  BIND(L_exit);
}
#else
// 32-bit entry point: same structure as the 64-bit version above, with
// 32-bit address arithmetic and a byte loop that loads through register A
// (the 32-bit crc32 form used here takes a register, not memory, operand).
// NOTE(review): unlike the 64-bit path, no assert here checks
// NUM_PRECOMPUTED_CONSTANTS against the number of chunk sizes — confirm
// this omission is intentional.
void MacroAssembler::crc32c_IPL_Alg2Alt2Fast(Register crc, Register buf, Register len,
                                             Scratch(Register A), Scratch(Register B), Scratch(Register C),
                                             Scratch(Register D), Scratch(Register E), Scratch(Register F),
                                             Westmere(Scratch(XMMRegister AXMM)), Westmere(Scratch(XMMRegister BXMM)), Westmere(Scratch(XMMRegister CXMM)),
                                             bool IsPclmulqdqSupported) {
  uint32_t CONSTOrPreCompConstIndex[CRC32C::NUM_PRECOMPUTED_CONSTANTS];
  Label L_wordByWord;
  Label L_byteByByteProlog;
  Label L_byteByByte;
  Label L_exit;

  if (IsPclmulqdqSupported) {
    // Precomputed constants, pairwise-swapped as in the 64-bit path.
    CONSTOrPreCompConstIndex[1] = *(uint32_t *)StubRoutines::_crc32c_table_addr;
    CONSTOrPreCompConstIndex[0] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 1);

    CONSTOrPreCompConstIndex[3] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 2);
    CONSTOrPreCompConstIndex[2] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 3);

    CONSTOrPreCompConstIndex[5] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 4);
    CONSTOrPreCompConstIndex[4] = *((uint32_t *)StubRoutines::_crc32c_table_addr + 5);
  } else {
    // No PCLMULQDQ: sub-table indices for the IPL_Alg4 fallback.
    CONSTOrPreCompConstIndex[0] = 1;
    CONSTOrPreCompConstIndex[1] = 0;

    CONSTOrPreCompConstIndex[2] = 3;
    CONSTOrPreCompConstIndex[3] = 2;

    CONSTOrPreCompConstIndex[4] = 5;
    CONSTOrPreCompConstIndex[5] = 4;
  }
  CRC32C::ProcChunk(CRC32C::HIGH, CONSTOrPreCompConstIndex[0], CONSTOrPreCompConstIndex[1], IsPclmulqdqSupported,
                    len, buf, crc,
                    A, B, C,
                    AXMM, BXMM, CXMM,
                    D, E,
                    F,
                    this);
  CRC32C::ProcChunk(CRC32C::MIDDLE, CONSTOrPreCompConstIndex[2], CONSTOrPreCompConstIndex[3], IsPclmulqdqSupported,
                    len, buf, crc,
                    A, B, C,
                    AXMM, BXMM, CXMM,
                    D, E,
                    F,
                    this);
  CRC32C::ProcChunk(CRC32C::LOW, CONSTOrPreCompConstIndex[4], CONSTOrPreCompConstIndex[5], IsPclmulqdqSupported,
                    len, buf, crc,
                    A, B, C,
                    AXMM, BXMM, CXMM,
                    D, E,
                    F,
                    this);
  // A = buf + (len rounded down to a multiple of 8): end of whole words.
  movl(A, len);
  andl(A, 0x00000007);
  negl(A);
  addl(A, len);
  addl(A, buf);

  BIND(L_wordByWord);
  cmpl(buf, A);
  jcc(Assembler::greaterEqual, L_byteByByteProlog);
  crc32(crc, Address(buf,0), 4);
  addl(buf, 4);
  jmp(L_wordByWord);

  BIND(L_byteByByteProlog);
  andl(len, 0x00000007);   // remaining byte count (0..7)
  movl(B, 1);              // 1-based byte counter

  BIND(L_byteByByte);
  cmpl(B, len);
  jccb(Assembler::greater, L_exit);
  movb(A, Address(buf, 0));
  crc32(crc, A, 1);
  incl(buf);
  incl(B);
  jmp(L_byteByByte);

  BIND(L_exit);
}
#endif // LP64


// Returns the x86 condition code that is the logical negation of 'cond'
// (compile-time helper, not emitted code). Every enumerator is handled;
// falling out of the switch is a bug.
Assembler::Condition MacroAssembler::negate_condition(Assembler::Condition cond) {
  switch (cond) {
    // Note some conditions are synonyms for others
    case Assembler::zero:         return Assembler::notZero;
    case Assembler::notZero:      return Assembler::zero;
    case Assembler::less:         return Assembler::greaterEqual;
    case Assembler::lessEqual:    return Assembler::greater;
    case Assembler::greater:      return Assembler::lessEqual;
    case Assembler::greaterEqual: return Assembler::less;
    case Assembler::below:        return Assembler::aboveEqual;
    case Assembler::belowEqual:   return Assembler::above;
    case Assembler::above:        return Assembler::belowEqual;
    case Assembler::aboveEqual:   return Assembler::below;
    case Assembler::overflow:     return Assembler::noOverflow;
    case Assembler::noOverflow:   return Assembler::overflow;
    case Assembler::negative:     return Assembler::positive;
    case Assembler::positive:     return Assembler::negative;
    case Assembler::parity:       return Assembler::noParity;
    case Assembler::noParity:     return Assembler::parity;
  }
  ShouldNotReachHere(); return Assembler::overflow;
}

// RAII guard: emits a test of *flag_addr against 'value' and a jump over
// the code generated inside the guard's scope when they are equal; the
// destructor binds the skip target.
SkipIfEqual::SkipIfEqual(
    MacroAssembler* masm, const bool* flag_addr, bool value) {
  _masm = masm;
  _masm->cmp8(ExternalAddress((address)flag_addr), value);
  _masm->jcc(Assembler::equal, _label);
}

SkipIfEqual::~SkipIfEqual() {
  // Land here when the flag matched and the guarded code was skipped.
  _masm->bind(_label);
}