1 /* 2 * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * Copyright (c) 2015, Linaro Ltd. All rights reserved. 5 * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. 6 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 7 * 8 * This code is free software; you can redistribute it and/or modify it 9 * under the terms of the GNU General Public License version 2 only, as 10 * published by the Free Software Foundation. 11 * 12 * This code is distributed in the hope that it will be useful, but WITHOUT 13 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 14 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 15 * version 2 for more details (a copy is included in the LICENSE file that 16 * accompanied this code). 17 * 18 * You should have received a copy of the GNU General Public License version 19 * 2 along with this work; if not, write to the Free Software Foundation, 20 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 21 * 22 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 23 * or visit www.oracle.com if you need additional information or have any 24 * questions. 
 *
 */

#include <sys/types.h>

#include "precompiled.hpp"
#include "jvm.h"
#include "asm/assembler.hpp"
#include "asm/assembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/cardTable.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "gc/shared/cardTableBarrierSet.hpp"
#include "interpreter/interpreter.hpp"
#include "compiler/disassembler.hpp"
#include "memory/resourceArea.hpp"
#include "nativeInst_aarch32.hpp"
#include "oops/accessDecorators.hpp"
//This ifdef was introduced so a core build can be built
#ifdef COMPILER2
#include "opto/compile.hpp"
#include "opto/node.hpp"
#endif

#include "runtime/biasedLocking.hpp"
#include "runtime/icache.hpp"
#include "runtime/interfaceSupport.inline.hpp"
#include "runtime/jniHandles.inline.hpp"
#include "runtime/sharedRuntime.hpp"

// BLOCK_COMMENT / STOP annotate the generated code stream in debug builds;
// in PRODUCT builds the annotations compile away.
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#define STOP(error) stop(error)
#else
#define BLOCK_COMMENT(str) block_comment(str)
#define STOP(error) block_comment(error); stop(error)
#endif

// Bind a label and emit its name as a block comment (debug builds only).
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// FIXME This is not a nice fix, this constant was in a compiler2 header
#define MAX_stubs_size_div2 (128 / 2)
// FIXME END

// Note the corrections in the following three instructions for the PC.
// All literal modes that use the PC need to have the offset adjusted
// Patch any kind of instruction; there may be several instructions.
// Return the total length (in bytes) of the instructions.

// Patch the pc-relative (or absolute address-load) instruction sequence at
// 'branch' so that it refers to 'target'.  The instruction kind is decoded
// from bits 27..24 of the first word.  Returns the total length in bytes of
// the instruction(s) that were patched.
int MacroAssembler::pd_patch_instruction_size(address branch, address target) {
  // Note the corrections
  int instructions = 1;
  // On A32 the PC reads as the address of the current instruction + 8.
  long offset = target - (branch + 8); // correct for that PC = PC_this + 2 instructions
  bool add = offset >= 0;              // sign selects the U (add/subtract) bit below
  unsigned insn = *(unsigned*)branch;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

  if(0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link: signed 24-bit word offset.
    assert(0 == (offset & 3), "not aligned correctly");
    Instruction_aarch32::spatch(branch, 23, 0, offset / 4);
  } else if (0b0011 == opc) {
    // Movw, Movt or mov, orr, orr, orr
    // patch up address load to registers (absolute address).
    instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz;
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB: 12-bit unsigned immediate, U bit in bit 23.
    Instruction_aarch32::patch(branch, 11, 0, uabs(offset));
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD: 8-bit immediate split into
    // imm4H (bits 11..8) and imm4L (bits 3..0), U bit in bit 23.
    offset = uabs(offset);
    Instruction_aarch32::patch(branch, 3, 0, offset & 0xf);
    Instruction_aarch32::patch(branch, 11, 8, offset >> 4);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    // 8-bit immediate scaled by 4, U bit in bit 23.
    offset = uabs(offset);
    assert(0 == (offset & 3), "vldr, vstr can't do unaligned access");
    Instruction_aarch32::patch(branch, 7, 0, offset >> 2);
    Instruction_aarch32::patch(branch, 23, 23, add);
  } else if (0b0010 == opc) {
    // ADR: modified-immediate encoding; bits 23..22 select ADD (0b10) vs
    // SUB (0b01) of the offset from the PC.
    Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset)));
    Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01 );
  } else {
    ShouldNotReachHere();
  }
  // aarch64 had something for polling page load?
  return instructions * NativeInstruction::arm_insn_sz;
}

// Patch a 32-bit absolute constant load at 'insn_addr' to load the value 'o'.
// Two sequences are recognized: movw/movt (newer ARMs) and the 4-instruction
// mov/orr/orr/orr fallback.  Returns the length in bytes of the patched
// sequence; asserts if the instructions do not match either form.
int MacroAssembler::patch_oop(address insn_addr, address o) {
  unsigned insn = *(unsigned*)insn_addr;
  int opc = Instruction_aarch32::extract(insn, 27, 21);
  if(0b0011000 == opc) {
    //32-bit pointers, formed of a mov and a movt
    assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch");

    // movw takes the low halfword, movt the high halfword; each encodes its
    // 16-bit immediate as imm4 (bits 19..16) : imm12 (bits 11..0).
    uint32_t btm = (uint32_t)o & 0xffff;
    Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12);
    Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff);
    uint32_t top = (uint32_t)o >> 16;
    Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12);
    Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff);
    return 2 * NativeInstruction::arm_insn_sz;
  } else if(0b0011101 == opc) {
    //Instead 32bit load sequence uses mov, orr, orr, orr
    assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch");
    assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch");
    // FIXME this could carry us outside valid memory

    // Each instruction contributes one byte of the address via a rotated
    // immediate (rotation field in bits 11..8 of the imm12).
    uint32_t addr = (uint32_t)o;
    Instruction_aarch32::patch(insn_addr + 0, 11, 0, (0b0000 << 8) | ((addr >> 0) & 0xff));
    Instruction_aarch32::patch(insn_addr + 4, 11, 0, (0b1100 << 8) | ((addr >> 8) & 0xff));
    Instruction_aarch32::patch(insn_addr + 8, 11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff));
    Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff));
    return 4 * NativeInstruction::arm_insn_sz;
  } else {
    ShouldNotReachHere();
  }
  return 0; //won't reach here
}

// Inverse of pd_patch_instruction_size: decode the target address that the
// instruction(s) at 'insn_addr' (first word pre-fetched in 'insn') refer to.
address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) {
  long offset = 0;
  int opc = Instruction_aarch32::extract(insn, 27, 24);

  if(0b1010 == opc || 0b1011 == opc) {
    // Branch or branch with link: signed 24-bit word offset.
    offset = Instruction_aarch32::sextract(insn, 23, 0) * 4;
  } else if (0b0011 == opc) {
    // Absolute address load; result does not depend on the PC.
    unsigned *insn_buf = (unsigned*)insn_addr;
    int opc2 = Instruction_aarch32::extract(insn, 23, 21);
    if(0b000 == opc2) {
      // movw, movt (only on newer ARMs)
      assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch");
      uint32_t addr;
      // Reassemble the address from movt (high halfword) and movw (low
      // halfword); see patch_oop for the matching encoder.
      addr  = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28;
      addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16;
      addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12;
      addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0);
      return address(addr);
    } else if(0b101 == opc2) {
      // mov, orr, orr, orr
      assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch");
      assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch");
      assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch");
      uint32_t addr;
      // TODO Check that the rotations are in the expected order.
      addr  = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0));
      addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0));
      return address(addr);
    } else {
      ShouldNotReachHere();
    }
  } else if (0b010 == (opc >> 1)) {
    // LDR, LDRB, STR, STRB: 12-bit immediate, U bit selects sign.
    offset = Instruction_aarch32::extract(insn, 11, 0);
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b000 == (opc >> 1)) {
    // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD: split 8-bit immediate.
    offset = Instruction_aarch32::extract(insn, 3, 0);
    offset |= Instruction_aarch32::extract(insn, 11, 8) << 4;
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b1101 == opc) {
    // VLDR, VSTR - NOTE VSTR(lit) is deprecated
    offset = Instruction_aarch32::extract(insn, 7, 0) << 2;
    bool add = Instruction_aarch32::extract(insn, 23, 23);
    offset = add ? offset : -offset;
  } else if (0b0010 == opc) {
    // ADR: modified immediate; bits 23..22 encode add (0b10) vs sub (0b01).
    offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0));
    int code = Instruction_aarch32::extract(insn, 23, 22);
    switch(code) {
    case 0b01: offset = -offset; break;
    case 0b10: break;
    default: ShouldNotReachHere();
    }
  } else {
    ShouldNotReachHere();
  }
  //Correct offset for PC (reads as current instruction + 8 on A32)
  offset += 8;
  return address(((uint32_t)insn_addr + offset));
}


// Emit a full data memory barrier (inner shareable domain).
void MacroAssembler::serialize_memory(Register thread, Register tmp) {
  dmb(Assembler::ISH);
}

// Emit a safepoint poll; branches to 'slow_path' when a safepoint is pending.
// Uses either the thread-local polling word or the global synchronization
// state, depending on the selected safepoint mechanism.  Clobbers rscratch1.
void MacroAssembler::safepoint_poll(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    ldr(rscratch1, Address(rthread, Thread::polling_page_offset()));
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    mov(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state()));
    ldr(rscratch1, Address(rscratch1));
    cmp(rscratch1, SafepointSynchronize::_not_synchronized);
    b(slow_path, Assembler::NE);
  }
}

// Just like safepoint_poll, but use an acquiring load for thread-
// local polling.
//
// We need an acquire here to ensure that any subsequent load of the
// global SafepointSynchronize::_state flag is ordered after this load
// of the local Thread::_polling page.  We don't want this poll to
// return false (i.e. not safepointing) and a later poll of the global
// SafepointSynchronize::_state spuriously to return true.
//
// This is to avoid a race when we're in a native->Java transition
// racing the code which wakes up from a safepoint.
//
void MacroAssembler::safepoint_poll_acquire(Label& slow_path) {
  if (SafepointMechanism::uses_thread_local_poll()) {
    // Plain load followed by a dmb gives the required acquire ordering.
    lea(rscratch1, Address(rthread, Thread::polling_page_offset()));
    ldr(rscratch1, rscratch1);
    dmb(Assembler::ISH);
    tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path);
  } else {
    safepoint_poll(slow_path);
  }
}

// Clear the thread's last-Java-frame anchor (sp, pc and optionally fp),
// making the frame no longer walkable.  Clobbers rscratch1.
void MacroAssembler::reset_last_Java_frame(bool clear_fp) {
  mov(rscratch1, 0);
  // we must set sp to zero to clear frame
  str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset()));
  // must clear fp, so that compiled frames are not confused; it is
  // possible that we need it only for debugging
  if (clear_fp) {
    str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset()));
  }

  // Always clear the pc because it could have been set by make_walkable()
  str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset()));
}

// Calls to C land
//
// When entering C land, the rfp & sp of the last Java frame have to be recorded
// in the (thread-local) JavaThread object. When leaving C land, the last Java fp
// has to be reset to 0. This is required to allow proper stack traversal.
// Record the last Java frame (sp, optionally fp and pc) in the thread's
// frame anchor.  If last_java_sp is the real sp it is first copied into
// 'scratch' (sp cannot be stored directly); if it is invalid, sp is used.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Register last_java_pc,
                                         Register scratch) {

  if (last_java_pc->is_valid()) {
    str(last_java_pc, Address(rthread,
                              JavaThread::frame_anchor_offset()
                              + JavaFrameAnchor::last_Java_pc_offset()));
  }

  // determine last_java_sp register
  if (last_java_sp == sp) {
    mov(scratch, sp);
    last_java_sp = scratch;
  } else if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset()));

  // last_java_fp is optional
  if (last_java_fp->is_valid()) {
    str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset()));
  }
}

// Variant taking the pc as a code address; materializes it into 'scratch'
// with adr, stores it in the anchor, then delegates to the register variant
// (with pc = noreg, since it has already been stored).
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         address last_java_pc,
                                         Register scratch) {
  if (last_java_pc != NULL) {
    adr(scratch, last_java_pc);
  } else {
    // FIXME: This is almost never correct.  We should delete all
    // cases of set_last_Java_frame with last_java_pc=NULL and use the
    // correct return address instead.
    adr(scratch, pc());
  }

  str(scratch, Address(rthread,
                       JavaThread::frame_anchor_offset()
                       + JavaFrameAnchor::last_Java_pc_offset()));

  set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch);
}

// Variant taking the pc as a Label.  If the label is not yet bound, a patch
// request is queued so the adr emitted for the NULL pc gets fixed up once
// the label is bound.
void MacroAssembler::set_last_Java_frame(Register last_java_sp,
                                         Register last_java_fp,
                                         Label &L,
                                         Register scratch) {
  if (L.is_bound()) {
    set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch);
  } else {
    InstructionMark im(this);
    L.add_patch_at(code(), locator());
    set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch);
  }
}

// Call a code-cache-resident target.  When the code cache is large enough
// that a plain bl might not reach, the target address is materialized into
// lr first.  Marks the call site in 'cbuf' if given.
void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf) {
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    lea(lr, entry);
    if (cbuf) cbuf->set_insts_mark();
    bl(lr);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    bl(entry);
  }
}

// Jump to a code-cache-resident target, via 'tmp' when far branches are
// needed.  Marks the jump site in 'cbuf' if given.
void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) {
  assert(CodeCache::find_blob(entry.target()) != NULL,
         "destination of far call not found in code cache");
  if (far_branches()) {
    lea(tmp, entry);
    if (cbuf) cbuf->set_insts_mark();
    b(tmp);
  } else {
    if (cbuf) cbuf->set_insts_mark();
    b(entry);
  }
}

// Check whether sp has grown into the reserved stack zone; if so, enable the
// zone via the runtime and throw a delayed StackOverflowError.  Clobbers
// rscratch1 and c_rarg0.
void MacroAssembler::reserved_stack_check() {
  // testing if reserved zone needs to be enabled
  Label no_reserved_zone_enabling;

  ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset()));
  cmp(sp, rscratch1);
  b(no_reserved_zone_enabling, Assembler::LO);

  enter();   // LR and FP are live.
  lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone));
  mov(c_rarg0, rthread);
  bl(rscratch1);
  leave();

  // We have already removed our own frame.
  // throw_delayed_StackOverflowError will think that it's been
  // called by our caller.
  lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry()));
  b(rscratch1);
  should_not_reach_here();

  bind(no_reserved_zone_enabling);
}

// Try to acquire the bias of 'obj_reg' for the current thread, branching to
// 'done' on success and to 'slow_case' (if non-NULL) when runtime revocation
// is required; falls through to the CAS-based locking path otherwise.
// Returns the code offset of the mark-word load (for implicit null checks),
// or -1 when swap_reg already contains the mark.
int MacroAssembler::biased_locking_enter(Register obj_reg,
                                         Register swap_reg,
                                         Register tmp_reg,
                                         Register tmp_reg2,
                                         bool swap_reg_contains_mark,
                                         Label& done,
                                         Label* slow_case,
                                         BiasedLockingCounters* counters) {
  assert(UseBiasedLocking, "why call this otherwise?");

  if (PrintBiasedLockingStatistics && counters == NULL)
    counters = BiasedLocking::counters();

  assert(tmp_reg != noreg, "must be real register");
  assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp_reg2);
  assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
  Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes());
  Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes());

  // Biased locking
  // See whether the lock is currently biased toward our thread and
  // whether the epoch is still valid
  // Note that the runtime guarantees sufficient alignment of JavaThread
  // pointers to allow age to be placed into low bits
  // First check to see whether biasing is even enabled for this object
  Label cas_label;
  int null_check_offset = -1;
  if (!swap_reg_contains_mark) {
    null_check_offset = offset();
    ldr(swap_reg, mark_addr);
  }
  andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(tmp_reg, markOopDesc::biased_lock_pattern);
  b(cas_label, Assembler::NE);
  // The bias pattern is present in the object's header. Need to check
  // whether the bias owner and the epoch are both still current.
  load_prototype_header(tmp_reg, obj_reg);
  orr(tmp_reg, tmp_reg, rthread);
  eor(tmp_reg, swap_reg, tmp_reg);
  // andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place));
  bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place);
  if (counters != NULL) {
    Label around;
    cbnz(tmp_reg, around);
    atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, tmp_reg2);
    b(done);
    bind(around);
  } else {
    // tmp_reg == 0 means the bias is ours and the epoch is current.
    cbz(tmp_reg, done);
  }

  Label try_revoke_bias;
  Label try_rebias;

  // At this point we know that the header has the bias pattern and
  // that we are not the bias owner in the current epoch. We need to
  // figure out more details about the state of the header in order to
  // know what operations can be legally performed on the object's
  // header.

  // If the low three bits in the xor result aren't clear, that means
  // the prototype header is no longer biased and we have to revoke
  // the bias on this object.
  andr(tmp_reg2, tmp_reg, markOopDesc::biased_lock_mask_in_place);
  cbnz(tmp_reg2, try_revoke_bias);

  // Biasing is still enabled for this data type. See whether the
  // epoch of the current bias is still valid, meaning that the epoch
  // bits of the mark word are equal to the epoch bits of the
  // prototype header. (Note that the prototype header's epoch bits
  // only change at a safepoint.) If not, attempt to rebias the object
  // toward the current thread. Note that we must be absolutely sure
  // that the current epoch is invalid in order to do this because
  // otherwise the manipulations it performs on the mark word are
  // illegal.
  andr(tmp_reg2, tmp_reg, markOopDesc::epoch_mask_in_place);
  cbnz(tmp_reg2, try_rebias);

  // The epoch of the current bias is still valid but we know nothing
  // about the owner; it might be set or it might be clear. Try to
  // acquire the bias of the object using an atomic operation. If this
  // fails we will go in to the runtime to revoke the object's bias.
  // Note that we first construct the presumed unbiased header so we
  // don't accidentally blow away another thread's valid bias.
  {
    Label here;
    mov(tmp_reg2, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place);
    andr(swap_reg, swap_reg, tmp_reg2);
    orr(tmp_reg, swap_reg, rthread);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case);
    // If the biasing toward our thread failed, this means that
    // another thread succeeded in biasing it toward itself and we
    // need to revoke that bias. The revocation will occur in the
    // interpreter runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()),
                 tmp_reg, tmp_reg2);
    }
  }
  b(done);

  bind(try_rebias);
  // At this point we know the epoch has expired, meaning that the
  // current "bias owner", if any, is actually invalid. Under these
  // circumstances _only_, we are allowed to use the current header's
  // value as the comparison value when doing the cas to acquire the
  // bias in the current epoch. In other words, we allow transfer of
  // the bias from one thread to another directly in this situation.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here;
    load_prototype_header(tmp_reg, obj_reg);
    orr(tmp_reg, rthread, tmp_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case);
    // If the biasing toward our thread failed, then another thread
    // succeeded in biasing it toward itself and we need to revoke that
    // bias. The revocation will occur in the runtime in the slow case.
    bind(here);
    if (counters != NULL) {
      atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()),
                 tmp_reg, tmp_reg2);
    }
  }
  b(done);

  bind(try_revoke_bias);
  // The prototype mark in the klass doesn't have the bias bit set any
  // more, indicating that objects of this data type are not supposed
  // to be biased any more. We are going to try to reset the mark of
  // this object to the prototype value and fall through to the
  // CAS-based locking scheme. Note that if our CAS fails, it means
  // that another thread raced us for the privilege of revoking the
  // bias of this particular object, so it's okay to continue in the
  // normal locking code.
  //
  // FIXME: due to a lack of registers we currently blow away the age
  // bits in this situation. Should attempt to preserve them.
  {
    Label here, nope;
    load_prototype_header(tmp_reg, obj_reg);
    cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, &nope);
    bind(here);

    // Fall through to the normal CAS-based lock, because no matter what
    // the result of the above CAS, some thread must have succeeded in
    // removing the bias bit from the object's header.
    if (counters != NULL) {
      atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg,
                 tmp_reg2);
    }
    bind(nope);
  }

  bind(cas_label);

  return null_check_offset;
}

// Branch to 'done' if the object's mark word still carries the biased-lock
// pattern, in which case unlocking is a no-op.  Clobbers temp_reg.
void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) {
  assert(UseBiasedLocking, "why call this otherwise?");

  // Check for biased locking unlock case, which is a no-op
  // Note: we do not have to check the thread ID for two reasons.
  // First, the interpreter checks for IllegalMonitorStateException at
  // a higher level. Second, if the bias was revoked while we held the
  // lock, the object could not be rebiased toward another thread, so
  // the bias bit would be clear.
  ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
  andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place);
  cmp(temp_reg, markOopDesc::biased_lock_pattern);
  b(done, Assembler::EQ);
}


// Helpers that move an argument into the corresponding C calling-convention
// register, eliding the mov when it is already there.
static void pass_arg0(MacroAssembler* masm, Register arg) {
  if (c_rarg0 != arg ) {
    masm->mov(c_rarg0, arg);
  }
}

static void pass_arg1(MacroAssembler* masm, Register arg) {
  if (c_rarg1 != arg ) {
    masm->mov(c_rarg1, arg);
  }
}

static void pass_arg2(MacroAssembler* masm, Register arg) {
  if (c_rarg2 != arg ) {
    masm->mov(c_rarg2, arg);
  }
}

static void pass_arg3(MacroAssembler* masm, Register arg) {
  if (c_rarg3 != arg ) {
    masm->mov(c_rarg3, arg);
  }
}

// Common tail for all call_VM variants: records the last Java frame, makes
// the call with the thread as the implicit first argument, resets the frame
// anchor, optionally forwards pending exceptions, and fetches the oop result.
void MacroAssembler::call_VM_base(Register oop_result,
                                  Register java_thread,
                                  Register last_java_sp,
                                  address entry_point,
                                  int number_of_arguments,
                                  bool check_exceptions) {
  // determine java_thread register
  if (!java_thread->is_valid()) {
    java_thread = rthread;
  }

  // determine last_java_sp register
  if (!last_java_sp->is_valid()) {
    last_java_sp = sp;
  }

  // debugging support
  assert(number_of_arguments >= 0 , "cannot have negative number of arguments");
  assert(java_thread == rthread, "unexpected register");

  assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result");
  assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp");

  // push java thread (becomes first argument of C function)
  mov(c_rarg0, java_thread);

  // set last Java frame before call
  assert(last_java_sp != rfp, "can't use rfp");

  Label l;
  set_last_Java_frame(last_java_sp, rfp, l, rscratch2);

  // FIXME - Can save lr in more elegant way ?
  //str(lr, pre(sp, -wordSize));

  // do the call, remove parameters
  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);

  //ldr(lr, post(sp, wordSize));

  // reset last Java frame
  // Only interpreter should have to clear fp
  reset_last_Java_frame(true);

  // C++ interp handles this in the interpreter
  check_and_handle_popframe(java_thread);
  check_and_handle_earlyret(java_thread);

  if (check_exceptions) {
    // check for pending exceptions (java_thread is set upon return)
    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
    Label ok;
    cbz(rscratch2, ok);

    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
    // forward_exception uses LR to choose exception handler but LR is trashed by previous code
    // since we used to get here from interpreted code BL is acceptable way to acquire correct LR (see StubGenerator::generate_forward_exception)
    bl(rscratch2);
    bind(ok);
  }

  // get oop result if there is one and reset the value in the thread
  if (oop_result->is_valid()) {
    get_vm_result(oop_result, java_thread);
  }
}

// Thin wrapper: call_VM_base with the default thread and stack pointer.
void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
}

// Maybe emit a call via a trampoline.  If the code cache is small
// trampolines won't be emitted.

// Emit a call to 'entry'.  With far branches the call is emitted as a
// patchable NativeCall-sized sequence whose raw 4-byte destination word can
// be rewritten atomically; otherwise a plain bl suffices.
void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
  assert(entry.rspec().type() == relocInfo::runtime_call_type
         || entry.rspec().type() == relocInfo::opt_virtual_call_type
         || entry.rspec().type() == relocInfo::static_call_type
         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");

  if (cbuf) {
    cbuf->set_insts_mark();
  }

  if (far_branches()) {
    // Have make trampoline such way: destination address should be raw 4 byte value,
    // so it's patching could be done atomically.
    relocate(entry.rspec());
    address start = pc();
    // Sequence: compute the return address into lr, load the destination
    // word into the pc, then the inline 32-bit destination itself.
    add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz);
    ldr(r15_pc, Address(r15_pc, 4));
    emit_int32((uintptr_t) entry.target());
    // possibly pad the call to the NativeCall size to make patching happy
    while (pc() - start < NativeCall::instruction_size) {
      nop();
    }
    assert(pc() - start == NativeCall::instruction_size, "fix NativeTrampolineCall::instruction_size!");
  } else {
    bl(entry);
  }
}

// Normalize a C truth value to a Java boolean: r0 = (r0 & 0xff) != 0 ? 1 : 0.
// NOTE(review): operates on r0 regardless of the 'x' parameter — presumably
// callers always pass r0; confirm before relying on other registers.
void MacroAssembler::c2bool(Register x) {
  ands(r0, r0, 0xff);
  mov(r0, 1, Assembler::NE);
}

// Emit an inline-cache call: load the IC marker (non_oop_word) into rscratch2
// and call 'entry' with a virtual_call relocation carrying 'method_index'.
void MacroAssembler::ic_call(address entry, jint method_index) {
  RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index);
  // address const_ptr = long_constant((jlong)Universe::non_oop_word());
  // unsigned long offset;
  // ldr_constant(rscratch2, const_ptr);
  movptr(rscratch2, (uintptr_t)Universe::non_oop_word());
  trampoline_call(Address(entry, rh));
}

// Implementation of call_VM versions
// The assert-guarded pass_argN ordering below prevents a later mov from
// clobbering an argument register still holding an earlier argument.

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             bool check_exceptions) {
  call_VM_helper(oop_result, entry_point, 0, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);

  pass_arg1(this, arg_1);
  call_VM_helper(oop_result, entry_point, 3, check_exceptions);
}

// Variants taking an explicit last_java_sp for the frame anchor.

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             int number_of_arguments,
                             bool check_exceptions) {
  call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             bool check_exceptions) {
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             bool check_exceptions) {

  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions);
}

void MacroAssembler::call_VM(Register oop_result,
                             Register last_java_sp,
                             address entry_point,
                             Register arg_1,
                             Register arg_2,
                             Register arg_3,
                             bool check_exceptions) {
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  pass_arg1(this, arg_1);
  call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions);
}


// Load the VM-call oop result from the thread, clear the thread slot, and
// verify the oop.  Clobbers rscratch2.
void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) {
  ldr(oop_result, Address(java_thread, JavaThread::vm_result_offset()));
  assert(oop_result != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
  verify_oop(oop_result, "broken oop in call_VM_base");
}

// Load the VM-call metadata result from the thread and clear the thread
// slot.  Clobbers rscratch2.
void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
  assert(metadata_result != rscratch2 &&
         java_thread != rscratch2, "can't be");
  mov(rscratch2, 0);
  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
}

// Pad the instruction stream with nops until the offset is a multiple of
// 'modulus'.
void MacroAssembler::align(int modulus) {
  while (offset() % modulus != 0) nop();
}

// these are no-ops overridden by InterpreterMacroAssembler

void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }

void MacroAssembler::check_and_handle_popframe(Register java_thread) { }


// Return the delayed value at 'delayed_value_addr' (+ offset): as a constant
// if it is already initialized, otherwise as code that loads it into 'tmp'
// at run time (solves a code-generation ordering problem).
RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
                                                      Register tmp,
                                                      int offset) {
  intptr_t value = *delayed_value_addr;
  if (value != 0)
    return RegisterOrConstant(value + offset);

  // load indirectly to solve generation ordering problem
  ldr(tmp, ExternalAddress((address) delayed_value_addr));

  if (offset != 0)
    add(tmp, tmp, offset);

  return RegisterOrConstant(tmp);
}


// Look up the method for a megamorphic invokeinterface call.
// The target method is determined by <intf_klass, itable_index>.
// The receiver klass is in recv_klass.
// On success, the result will be in method_result, and execution falls through.
// On failure, execution transfers to the given label.
void MacroAssembler::lookup_interface_method(Register recv_klass,
                                             Register intf_klass,
                                             RegisterOrConstant itable_index,
                                             Register method_result,
                                             Register scan_temp,
                                             Label& L_no_such_interface,
                                             bool return_method) {
  assert_different_registers(recv_klass, intf_klass, scan_temp);
  assert_different_registers(method_result, intf_klass, scan_temp);
  assert(recv_klass != method_result || !return_method,
         "recv_klass can be destroyed when method isn't needed");

  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
  int vtable_base = in_bytes(InstanceKlass::vtable_start_offset());
  int itentry_off = itableMethodEntry::method_offset_in_bytes();
  int scan_step = itableOffsetEntry::size() * wordSize;
  int vte_size = vtableEntry::size_in_bytes();
  assert(vte_size == wordSize, "else adjust times_vte_scale");

  ldr(scan_temp, Address(recv_klass, in_bytes(InstanceKlass::vtable_length_offset())));

  // %%% Could store the aligned, prescaled offset in the klassoop.
  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
  // scan_temp = recv_klass + vtable_length * 4 (lsl 2 == times wordSize).
  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
  add(scan_temp, scan_temp, vtable_base);

  if (return_method) {
    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
    lea(recv_klass, itable_index.is_register() ?
            Address(recv_klass, itable_index, lsl(2)) :
            Address(recv_klass, itable_index.as_constant() << 2));
    if (itentry_off)
      add(recv_klass, recv_klass, itentry_off);
  }

  // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) {
  //   if (scan->interface() == intf) {
  //     result = (klass + scan->offset() + itable_index);
  //   }
  // }
  Label search, found_method;

  // Loop is peeled once: the first probe branches forward on a hit,
  // subsequent probes invert the test so a hit falls through to found_method.
  for (int peel = 1; peel >= 0; peel--) {
    ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes()));
    cmp(intf_klass, method_result);

    if (peel) {
      b(found_method, Assembler::EQ);
    } else {
      b(search, Assembler::NE);
      // (invert the test to fall through to found_method...)
    }

    if (!peel) break;

    bind(search);

    // Check that the previous entry is non-null. A null entry means that
    // the receiver class doesn't implement the interface, and wasn't the
    // same as when the caller was compiled.
    cbz(method_result, L_no_such_interface);
    add(scan_temp, scan_temp, scan_step);
  }

  bind(found_method);

  if (return_method) {
    // Got a hit.
    ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes()));
    ldr(method_result, Address(recv_klass, scan_temp));
  }
}

// virtual method calling
void MacroAssembler::lookup_virtual_method(Register recv_klass,
                                           RegisterOrConstant vtable_index,
                                           Register method_result) {
  const int base = in_bytes(InstanceKlass::vtable_start_offset());
  int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes();
  if (vtable_index.is_register()) {
    lea(method_result, Address(recv_klass,
                               vtable_index.as_register(),
                               lsl(LogBytesPerWord)));
    ldr(method_result, Address(method_result, vtable_offset_in_bytes));
  } else {
    vtable_offset_in_bytes += vtable_index.as_constant() * wordSize;
    // Use an immediate load offset when it fits in the 12-bit field,
    // otherwise materialize the offset in method_result first.
    if(is_valid_for_offset_imm(vtable_offset_in_bytes, 12)) {
      ldr(method_result, Address(recv_klass, vtable_offset_in_bytes));
    } else {
      mov(method_result, vtable_offset_in_bytes);
      ldr(method_result, Address(recv_klass, method_result));
    }
  }
}

// Convenience wrapper: branches to L_success on a positive subtype check,
// falls through on failure.
void MacroAssembler::check_klass_subtype(Register sub_klass,
                                         Register super_klass,
                                         Register temp_reg,
                                         Label& L_success) {
  Label L_failure;
  check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL);
  check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL);
  bind(L_failure);
}


void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   Label* L_slow_path,
                                                   RegisterOrConstant super_check_offset) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  bool must_load_sco = (super_check_offset.constant_or_zero() == -1);
  if (super_check_offset.is_register()) {
    assert_different_registers(sub_klass, super_klass,
                               super_check_offset.as_register());
  } else if (must_load_sco) {
    assert(temp_reg !=
           noreg, "supply either a temp or a register offset");
  }

  // Any label left NULL by the caller means "fall through here".
  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  int sco_offset = in_bytes(Klass::super_check_offset_offset());
  Address super_check_offset_addr(super_klass, sco_offset);

  // Hacked jmp, which may only be used just before L_fallthrough.
#define final_jmp(label)                                                \
  if (&(label) == &L_fallthrough) { /*do nothing*/ }                    \
  else                            b(label)                /*omit semi*/

  // If the pointers are equal, we are done (e.g., String[] elements).
  // This self-check enables sharing of secondary supertype arrays among
  // non-primary types such as array-of-interface. Otherwise, each such
  // type would need its own customized SSA.
  // We move this check to the front of the fast path because many
  // type checks are in fact trivially successful in this manner,
  // so we get a nicely predicted branch right at the start of the check.
  cmp(sub_klass, super_klass);
  b(*L_success, Assembler::EQ);

  // Check the supertype display:
  if (must_load_sco) {
    ldr(temp_reg, super_check_offset_addr);
    super_check_offset = RegisterOrConstant(temp_reg);
  }
  Address super_check_addr(sub_klass, super_check_offset);
  ldr(rscratch1, super_check_addr);
  cmp(super_klass, rscratch1); // load displayed supertype

  // This check has worked decisively for primary supers.
  // Secondary supers are sought in the super_cache ('super_cache_addr').
  // (Secondary supers are interfaces and very deeply nested subtypes.)
  // This works in the same check above because of a tricky aliasing
  // between the super_cache and the primary super display elements.
  // (The 'super_check_addr' can address either, as the case requires.)
  // Note that the cache is updated below if it does not help us find
  // what we need immediately.
  // So if it was a primary super, we can just fail immediately.
  // Otherwise, it's the slow path for us (no success at this point).

  if (super_check_offset.is_register()) {
    b(*L_success, Assembler::EQ);
    cmp(super_check_offset.as_register(), sc_offset);
    if (L_failure == &L_fallthrough) {
      b(*L_slow_path, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_slow_path);
    }
  } else if (super_check_offset.as_constant() == sc_offset) {
    // Need a slow path; fast failure is impossible.
    if (L_slow_path == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_slow_path, Assembler::NE);
      final_jmp(*L_success);
    }
  } else {
    // No slow path; it's a fast decision.
    if (L_failure == &L_fallthrough) {
      b(*L_success, Assembler::EQ);
    } else {
      b(*L_failure, Assembler::NE);
      final_jmp(*L_success);
    }
  }

  bind(L_fallthrough);

#undef final_jmp
}

// These two are taken from x86, but they look generally useful

// scans count pointer sized words at [addr] for occurence of value,
// generic
void MacroAssembler::repne_scan(Register addr, Register value, Register count,
                                Register scratch) {
  Label loop, fail, found;
  cmp(count, 0);
  b(fail, EQ);

  bind(loop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(found, EQ);
  subs(count, count, 1);
  b(loop, NE);

  bind(fail);
  cmp(sp, 0); // sp never zero; forces NE so the caller sees "not found"
  bind(found);
}

// Form an address from base + offset in Rd.
// Rd may or may
// not actually be used: you must use the Address that is returned.
// It is up to you to ensure that the shift provided matches the size
// of your data.
Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) {
  // form_address result should only be used together with ldr/str instructions
  // otherwise please provide exact type instead of IDT_INT or apply safe_for()
  if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT))
    // It fits; no need for any heroics
    return Address(base, byte_offset);

  // See if we can do this with two 12-bit offsets
  {
    unsigned long masked_offset = byte_offset & ~0xfff;
    if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT)
        && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) {
      add(Rd, base, masked_offset);
      byte_offset -= masked_offset;
      return Address(Rd, byte_offset);
    }
  }

  // Do it the hard way
  mov(Rd, byte_offset);
  add(Rd, base, Rd);
  return Address(Rd);
}

// scans count 4 byte words at [addr] for occurence of value,
// generic
/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count,
                                Register scratch) {
  Label Lloop, Lexit;
  cbz(count, Lexit);
  bind(Lloop);
  ldr(scratch, post(addr, wordSize));
  cmp(value, scratch);
  b(Lexit, EQ);
  sub(count, count, 1);
  cbnz(count, Lloop);
  bind(Lexit);
}*/

void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
                                                   Register super_klass,
                                                   Register temp_reg,
                                                   Register temp2_reg,
                                                   Label* L_success,
                                                   Label* L_failure,
                                                   bool set_cond_codes) {
  assert_different_registers(sub_klass, super_klass, temp_reg);
  if (temp2_reg != noreg)
    assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1);
#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg)

  Label L_fallthrough;
  int label_nulls = 0;
  if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
  if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
  assert(label_nulls <= 1, "at most one NULL in the batch");

  // a couple of useful fields in sub_klass:
  int ss_offset = in_bytes(Klass::secondary_supers_offset());
  int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
  Address secondary_supers_addr(sub_klass, ss_offset);
  Address super_cache_addr(     sub_klass, sc_offset);

  BLOCK_COMMENT("check_klass_subtype_slow_path");

  // Do a linear scan of the secondary super-klass chain.
  // This code is rarely used, so simplicity is a virtue here.
  // The repne_scan instruction uses fixed registers, which we must spill.
  // Don't worry too much about pre-existing connections with the input regs.

  assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super)
  assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter)

  // Spill any fixed register the scan needs that is not already a temp.
  RegSet pushed_registers;
  if (!IS_A_TEMP(r2))  pushed_registers += r2;
  if (!IS_A_TEMP(r14)) pushed_registers += r14;

  if (super_klass != r0) {
    if (!IS_A_TEMP(r0)) pushed_registers += r0;
  }

  push(pushed_registers, sp);

  // Get super_klass value into r0 (even if it was in r5 or r2).
  if (super_klass != r0) {
    mov(r0, super_klass);
  }

#ifndef PRODUCT
  // Bump the partial-subtype counter for diagnostics.
  mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr);
  Address pst_counter_addr(rscratch2);
  ldr(rscratch1, pst_counter_addr);
  add(rscratch1, rscratch1, 1);
  str(rscratch1, pst_counter_addr);
#endif //PRODUCT

  // We will consult the secondary-super array.
  ldr(r14, secondary_supers_addr);
  // Load the array length.
  ldr(r2, Address(r14, Array<Klass*>::length_offset_in_bytes()));
  // Skip to start of data.
  add(r14, r14, Array<Klass*>::base_offset_in_bytes());

  cmp(sp, 0); // Clear Z flag; SP is never zero
  // Scan R2 words at [R14] for an occurrence of R0.
  // Set NZ/Z based on last compare.
  repne_scan(r14, r0, r2, rscratch1);

  // Unspill the temp. registers:
  pop(pushed_registers, sp);

  b(*L_failure, Assembler::NE);

  // Success. Cache the super we found and proceed in triumph.
  str(super_klass, super_cache_addr);

  if (L_success != &L_fallthrough) {
    b(*L_success);
  }

#undef IS_A_TEMP

  bind(L_fallthrough);
}


// Emit a call to the verify-oop stub for reg; no-op unless VerifyOops.
void MacroAssembler::verify_oop(Register reg, const char* s) {
  if (!VerifyOops) return;

  // Pass register number to verify_oop_subroutine
  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop: %s: %s", reg->name(), s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop {");

  // Preserve the registers the subroutine call clobbers.
  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  mov(r0, reg);
  mov(rscratch1, (address)b);
  mrs(r1);

  // call indirectly to solve generation ordering problem
  reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp);
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);
  reg_printf("Verify oop exit, sp = %p, rfp = %p\n", sp, rfp);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop");
}

// Like verify_oop, but the oop is loaded from memory at addr.
void MacroAssembler::verify_oop_addr(Address addr, const char* s) {
  if (!VerifyOops) return;

  const char* b = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("verify_oop_addr: %s", s);
    b = code_string(ss.as_string());
  }
  BLOCK_COMMENT("verify_oop_addr {");

  stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());
  mrs(r1);

  // addr may contain sp so we will have to adjust it based on the
  // pushes that we just did.
  if (addr.uses(sp)) {
    lea(r0, addr);
    ldr(r0, Address(r0, 5 * wordSize));
  } else {
    ldr(r0, addr);
  }
  mov(rscratch1, (address)b);

  // call indirectly to solve generation ordering problem
  lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address()));
  ldr(rscratch2, Address(rscratch2));
  bl(rscratch2);

  msr(r1);
  ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits());

  BLOCK_COMMENT("} verify_oop_addr");
}

// Address of an interpreter expression-stack argument slot, possibly
// emitting an add into rscratch1 for a register slot index.
Address MacroAssembler::argument_address(RegisterOrConstant arg_slot,
                                         int extra_slot_offset) {
  // cf. TemplateTable::prepare_invoke(), if (load_receiver).
  int stackElementSize = Interpreter::stackElementSize;
  int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0);
#ifdef ASSERT
  int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1);
  assert(offset1 - offset == stackElementSize, "correct arithmetic");
#endif
  if (arg_slot.is_constant()) {
    return Address(sp, arg_slot.as_constant() * stackElementSize
                   + offset);
  } else {
    add(rscratch1, sp, arg_slot.as_register(),
        lsl(exact_log2(stackElementSize)));
    return Address(rscratch1, offset);
  }
}

// Leaf call into the VM runtime: aligns sp to 8 bytes (saving the old
// sp on the stack so it can be restored afterwards), then calls
// entry_point indirectly through rscratch2.
void MacroAssembler::call_VM_leaf_base(address entry_point,
                                       int number_of_arguments,
                                       Label *retaddr) {
  Label E, L;

  //FIXME Do this alignment in a more elegant way
  mov(rscratch2, sp);
  sub(sp, sp, wordSize);
  bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes
  str(rscratch2, Address(sp));

  // FIXME Do we need to preserve rscratch2?
  //str(rscratch2, Address(pre(sp, -wordSize)));

  mov(rscratch2, entry_point);
  reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp);
  bl(rscratch2);
  if (retaddr)
    bind(*retaddr);
  reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp);

  //ldr(rscratch2, Address(post(sp, wordSize)));

  //Undo alignment
  ldr(sp, Address(sp));

  maybe_isb();
}

void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) {
  call_VM_leaf_base(entry_point, number_of_arguments);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0,
                                  Register arg_1, Register arg_2) {
  pass_arg0(this, arg_0);
  pass_arg1(this, arg_1);
  pass_arg2(this, arg_2);
  call_VM_leaf_base(entry_point, 3);
}

// super_call_VM_leaf variants explicitly invoke the MacroAssembler
// implementation (bypassing any InterpreterMacroAssembler override).
void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) {
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 1);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) {

  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 2);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) {
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 3);
}

void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) {
  assert(arg_0 != c_rarg3, "smashed arg");
  assert(arg_1 != c_rarg3, "smashed arg");
  assert(arg_2 != c_rarg3, "smashed arg");
  pass_arg3(this, arg_3);
  assert(arg_0 != c_rarg2, "smashed arg");
  assert(arg_1 != c_rarg2, "smashed arg");
  pass_arg2(this, arg_2);
  assert(arg_0 != c_rarg1, "smashed arg");
  pass_arg1(this, arg_1);
  pass_arg0(this, arg_0);
  MacroAssembler::call_VM_leaf_base(entry_point, 4);
}

// Clobbers rscratch1
void MacroAssembler::null_check(Register reg, int offset) {
  if (needs_explicit_null_check(offset)) {
    // provoke OS NULL exception if reg = NULL by
    // accessing M[reg] w/o changing any registers
    // NOTE: this is plenty to provoke a segv
    reg_printf("Generating OS check null with ptr = %p\n", reg);
    assert(reg != rscratch1, "can't be");
    ldr(rscratch1, Address(reg));
  } else {
    // nothing to do, (later) access of M[reg + offset]
    // will provoke OS NULL exception if reg = NULL
  }
}

// MacroAssembler protected routines needed to implement
// public methods

// Materialize the target of dest into r, recording its relocation info
// so the constant can be patched later.
void MacroAssembler::mov(Register r, Address dest, Condition cond) {
  code_section()->relocate(pc(), dest.rspec());
  uint32_t imm32 = (uint32_t)dest.target();
  movptr(r, imm32, cond);
}

// Move a constant pointer into r. In aarch32 address space
// is 32 bits in size and so a pointer can be encoded in two mov
// instructions.
1403 void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) { 1404 #ifndef PRODUCT 1405 { 1406 char buffer[64]; 1407 snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); 1408 block_comment(buffer); 1409 } 1410 #endif 1411 Assembler::mov_immediate32(r, imm32, cond, false); 1412 } 1413 1414 void MacroAssembler::ret(Register reg) { 1415 assert(reg == lr, "Can do return only to LR"); 1416 b(lr); 1417 } 1418 1419 void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) { 1420 Label retry_load; 1421 bind(retry_load); 1422 // flush and load exclusive from the memory location 1423 ldrex(tmp, counter_addr); 1424 add(tmp, tmp, 1); 1425 // if we store+flush with no intervening write tmp wil be zero 1426 strex(tmp, tmp, counter_addr); 1427 cmp(tmp, 0); 1428 b(retry_load, Assembler::NE); 1429 } 1430 1431 1432 // MacroAssembler routines found actually to be needed 1433 1434 void MacroAssembler::push(Register src) 1435 { 1436 str(src, Address(pre(sp, -1 * wordSize))); 1437 } 1438 1439 void MacroAssembler::pop(Register dst) 1440 { 1441 ldr(dst, Address(post(sp, 1 * wordSize))); 1442 } 1443 1444 // Note: load_unsigned_short used to be called load_unsigned_word. 
// Load a halfword and zero-extend; returns the code offset of the load.
int MacroAssembler::load_unsigned_short(Register dst, Address src) {
  int off = offset();
  ldrh(dst, src);
  return off;
}

// Load a byte and zero-extend; returns the code offset of the load.
int MacroAssembler::load_unsigned_byte(Register dst, Address src) {
  int off = offset();
  ldrb(dst, src);
  return off;
}

// Load a halfword and sign-extend; returns the code offset of the load.
int MacroAssembler::load_signed_short(Register dst, Address src) {
  int off = offset();
  ldrsh(dst, src);
  return off;
}

// Load a byte and sign-extend; returns the code offset of the load.
int MacroAssembler::load_signed_byte(Register dst, Address src) {
  int off = offset();
  ldrsb(dst, src);
  return off;
}

// Dispatch a load of size_in_bytes (1, 2 or 4) with the requested
// signedness.  The 8-byte case is disabled on aarch32; dst2 is unused here.
void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) {
  switch (size_in_bytes) {
  //case 8: ldr(dst, src); break;
  case 4:  ldr(dst, src); break;
  case 2:  is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break;
  case 1:  is_signed ? load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break;
  default: ShouldNotReachHere();
  }
}

// Store counterpart of load_sized_value; src2 is unused here.
void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) {
  switch (size_in_bytes) {
  //case 8: str(src, dst); break;
  case 4:  str(src, dst); break;
  case 2:  strh(src, dst); break;
  case 1:  strb(src, dst); break;
  default: ShouldNotReachHere();
  }
}

// Subtract value from reg, going through rscratch2 when the constant
// cannot be encoded as an add/sub immediate.  Negative values delegate
// to increment.
void MacroAssembler::decrement(Register reg, int value) {
  if (value < 0) {
    increment(reg, -value);
    return;
  }
  if (value == 0) {
    return;
  }
  if (operand_valid_for_add_sub_immediate(value)) {
    sub(reg, reg, value);
    return;
  }
  assert(reg != rscratch2, "invalid register for decrement");
  mov(rscratch2, (unsigned int) value);
  sub(reg, reg, rscratch2);
}

// Read-modify-write decrement of a memory word; clobbers rscratch1.
void MacroAssembler::decrement(Address dst, int value) {
  assert(!dst.uses(rscratch1), "invalid address for decrement");
  ldr(rscratch1, dst);
  decrement(rscratch1, value);
  str(rscratch1, dst);
}

// Add value to reg; mirror image of decrement above.
void MacroAssembler::increment(Register reg, int value) {
  if (value < 0) {
    decrement(reg, -value);
    return;
  }
  if (value == 0) {
    return;
  }
  if (operand_valid_for_add_sub_immediate(value)) {
    add(reg, reg, value);
    return;
  }
  assert(reg != rscratch2, "invalid register for increment");
  mov(rscratch2, (unsigned int) value);
  add(reg, reg, rscratch2);
}

// Read-modify-write increment of a memory word; clobbers rscratch1.
void MacroAssembler::increment(Address dst, int value) {
  assert(!dst.uses(rscratch1), "invalid address for increment");
  ldr(rscratch1, dst);
  increment(rscratch1, value);
  str(rscratch1, dst);
}

// Loads and stores everything except the pc and sp
// (bitmask 0b0101111111111111 = r0-r12 and lr).
void MacroAssembler::pusha() {
  unsigned regset = 0b0101111111111111;
  stmdb(sp, regset);
}
void MacroAssembler::popa() {
  unsigned regset = 0b0101111111111111;
  ldmia(sp, regset);
}

// Sanity-check a register bitset against the multi-load/store forms
// that ARM deprecates (SP in the list, LR+PC together, base in list).
static void multiple_reg_check(unsigned int bitset, Register stack) {
  const unsigned int pcbit = 1 << r15_pc->encoding();
  const unsigned int lrbit = 1 << lr->encoding();
  const unsigned int spbit = 1 << sp->encoding();
  const unsigned int stackbit = 1 << stack->encoding();
  assert(!(bitset & spbit), "The SP can be in the list. However, "
      "ARM deprecates using these instructions with SP in the list.");
  assert(!(bitset & pcbit) || !(bitset & lrbit),
      "ARM deprecates using these instructions with both "
      "the LR and the PC in the list.");
  assert(!(bitset & stackbit), "Instructions with the base register "
      "in the list and ! specified are only available before ARMv7, "
      "and ARM deprecates the use of such instructions. "
      "The value of the base register after such an instruction is UNKNOWN");
}

// Push lots of registers in the bit set supplied. Don't push sp.
// Return the number of words pushed
int MacroAssembler::push(unsigned int bitset, Register stack) {
  multiple_reg_check(bitset, stack);
  // Count the set bits to report how many words were pushed.
  unsigned bc = bitset, count = 0, i;
  for(i = 0; i <= 15; i++) {
    if (1 & bc) count++;
    bc >>= 1;
  }
  // TODO Also why did it only do even quantities before?
  stmdb(stack, bitset);
  return count;
}

// Pop the registers in the bit set; returns the number of words popped.
int MacroAssembler::pop(unsigned int bitset, Register stack) {
  multiple_reg_check(bitset, stack);
  unsigned bc = bitset, count = 0, i;
  for(i = 0; i <= 15; i++) {
    if (1 & bc) count++;
    bc >>= 1;
  }
  // TODO Also why did it only do even quantities before?
  ldmia(stack, bitset);
  return count;
}

// Resolve a jobject/jweak handle in value to the oop it refers to,
// distinguishing weak handles by the low tag bit.  NULL stays NULL.
void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) {
  Label done, not_weak;
  cbz(value, done);           // Use NULL as-is.

  STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
  tbz(value, 0, not_weak);    // Test for jweak tag.

  // Resolve jweak.

  access_load_word_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF,
                      value, Address(value, -JNIHandles::weak_tag_value), tmp, noreg);
  verify_oop(value);
  b(done);


  bind(not_weak);
  // Resolve (untagged) jobject.
  access_load_word_at(T_OBJECT, IN_NATIVE, value, Address(value), tmp, noreg);
  verify_oop(value);
  bind(done);
}

// Emit a fatal stop: save registers (and the pre-pusha sp) for the
// debug32 dump, then call debug32 and halt.
void MacroAssembler::stop(const char* msg) {
  pusha();
  // Save old sp value
  add(rscratch2, sp, 14 * wordSize);
  str(rscratch2, Address(pre(sp, -4)));
  mov(c_rarg0, (address)msg);
  mov(c_rarg1, r15_pc);
  sub(c_rarg1, c_rarg1, 8); // Restore to actual value
  mov(c_rarg2, sp);
  mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
  bl(c_rarg3);
  hlt(0);
}

// stop() with a standard "unimplemented: ..." message.
void MacroAssembler::unimplemented(const char* what) {
  const char* buf = NULL;
  {
    ResourceMark rm;
    stringStream ss;
    ss.print("unimplemented: %s", what);
    buf = code_string(ss.as_string());
  }
  stop(buf);
}

// this simulates the behaviour of the x86 cmpxchg instruction using a
// load linked/store conditional pair. we use the acquire/release
// versions of these instructions so that we flush pending writes as
// per Java semantics.

// n.b the x86 version assumes the old value to be compared against is
// in rax and updates rax with the value located in memory if the
// cmpxchg fails. we supply a register for the old value explicitly

// the aarch32 load linked/store conditional instructions do not
// accept an offset. so, unlike x86, we must provide a plain register
// to identify the memory word to be compared/exchanged rather than a
// register+offset Address.

void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp,
                                Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  Label retry_load, nope;

  bind(retry_load);
  // flush and load exclusive from the memory location
  // and fail if it is not what we expect
  ldrex(tmp, addr);
  cmp(tmp, oldv);
  b(nope, Assembler::NE);
  // if we store+flush with no intervening write tmp will be zero
  strex(tmp, newv, addr);
  cmp(tmp, 0);
  b(succeed, Assembler::EQ);
  // retry so we only ever return after a load fails to compare
  // ensures we don't return a stale value after a failed write.
  b(retry_load);
  // if the memory word differs we return it in oldv and signal a fail
  bind(nope);
  membar(AnyAny);
  mov(oldv, tmp);
  if (fail)
    b(*fail);
}

// cmpxchgptr on an object's mark word (header at offset 0).
void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp,
                                        Label &succeed, Label *fail) {
  assert(oopDesc::mark_offset_in_bytes() == 0, "assumption");
  cmpxchgptr(oldv, newv, obj, tmp, succeed, fail);
}

// Word-sized compare-and-exchange; same protocol as cmpxchgptr above.
void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp,
                              Label &succeed, Label *fail) {
  // oldv holds comparison value
  // newv holds value to write in exchange
  // addr identifies memory word to compare against/update
  // tmp returns 0/1 for success/failure
  Label retry_load, nope;

  bind(retry_load);
  // flush and load exclusive from the memory location
  // and fail if it is not what we expect
  ldrex(tmp, addr);
  cmp(tmp, oldv);
  b(nope, Assembler::NE);
  // if we store+flush with no intervening write tmp will be zero
  strex(tmp, newv, addr);
  cmp(tmp, 0);
  b(succeed, Assembler::EQ);
  // retry so we only ever return after a load fails to compare
  // ensures we don't return a stale value after a failed write.
  b(retry_load);
  // if the memory word differs we return it in oldv and signal a fail
  bind(nope);
  membar(AnyAny);
  mov(oldv, tmp);
  if (fail)
    b(*fail);
}

#ifndef PRODUCT
extern "C" void findpc(intptr_t x);
#endif

// Runtime debug helper reached from stop(): msg, the faulting pc and
// the register file saved by pusha() are passed in from generated code.
void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[])
{
  print_unseen_bytecodes();
  // In order to get locks to work, we need to fake a in_VM state
  if (ShowMessageBoxOnError) {
    JavaThread* thread = JavaThread::current();
    JavaThreadState saved_state = thread->thread_state();
    thread->set_thread_state(_thread_in_vm);
#ifndef PRODUCT
    if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
      ttyLocker ttyl;
      BytecodeCounter::print();
    }
#endif
    if (os::message_box(msg, "Execution stopped, print registers?")) {
      ttyLocker ttyl;
      tty->print_cr(" pc = 0x%016x", pc);
#ifndef PRODUCT
      tty->cr();
      findpc(pc);
      tty->cr();
#endif
      // NOTE(review): this dump prints regs[i] directly while the branch
      // below uses a shifted mapping (regs[0] = sp there); the original
      // author flagged it with the message below -- verify the layout
      // pushed by stop() before trusting these values.
      tty->print_cr("THIS IS WRONG!");
      tty->print_cr(" r0 = 0x%016x", regs[0]);
      tty->print_cr(" r1 = 0x%016x", regs[1]);
      tty->print_cr(" r2 = 0x%016x", regs[2]);
      tty->print_cr(" r3 = 0x%016x", regs[3]);
      tty->print_cr(" r4 = 0x%016x", regs[4]);
      tty->print_cr(" r5 = 0x%016x", regs[5]);
      tty->print_cr(" r6 = 0x%016x", regs[6]);
      tty->print_cr(" r7 = 0x%016x", regs[7]);
      tty->print_cr(" r8 = 0x%016x", regs[8]);
      tty->print_cr(" r9 = 0x%016x", regs[9]);
      tty->print_cr("r10 = 0x%016x", regs[10]);
      tty->print_cr("r11 = 0x%016x", regs[11]);
      tty->print_cr("r12 = 0x%016x", regs[12]);
      tty->print_cr("r13 = 0x%016x", regs[13]);
      tty->print_cr("r14 = 0x%016x", regs[14]);
      tty->print_cr("r15 = 0x%016x", regs[15]);
      BREAKPOINT;
    }
ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); 1760 } else { 1761 { 1762 ttyLocker ttyl; 1763 ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg); 1764 ::tty->print_cr(" r0 [ arg0 ] = 0x%08x", regs[1]); 1765 ::tty->print_cr(" r1 [ arg1 ] = 0x%08x", regs[2]); 1766 ::tty->print_cr(" r2 [ arg2 ] = 0x%08x", regs[3]); 1767 ::tty->print_cr(" r3 [ arg3 ] = 0x%08x", regs[4]); 1768 ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]); 1769 ::tty->print_cr(" r5 [ rbcp ] = 0x%08x", regs[6]); 1770 ::tty->print_cr(" r6 [ rlocals ] = 0x%08x", regs[7]); 1771 ::tty->print_cr(" r7 [ rcpool ] = 0x%08x", regs[8]); 1772 ::tty->print_cr(" r8 [ rmethod ] = 0x%08x", regs[9]); 1773 ::tty->print_cr(" r9 [ rscratch1 ] = 0x%08x", regs[10]); 1774 ::tty->print_cr("r10 [ rthread ] = 0x%08x", regs[11]); 1775 ::tty->print_cr("r11 [ rfp ] = 0x%08x", regs[12]); 1776 ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]); 1777 ::tty->print_cr("r13 [ sp ] = 0x%08x", regs[0]); 1778 ::tty->print_cr("r14 [ lr ] = 0x%08x", regs[14]); 1779 ::tty->print_cr("r15 [ pc ] = 0x%08x", pc); 1780 } 1781 assert(false, "DEBUG MESSAGE: %s", msg); 1782 } 1783 } 1784 1785 void MacroAssembler::push_call_clobbered_registers() { 1786 push(RegSet::range(r0, r3), sp); 1787 if(hasFPU()) { 1788 const int nfloat = 16; // number of callee-saved 32-bit float registers 1789 vstmdb_f64(sp, (1 << nfloat/2) - 1); 1790 } 1791 } 1792 1793 void MacroAssembler::pop_call_clobbered_registers() { 1794 if(hasFPU()) { 1795 const int nfloat = 16; // number of callee-saved 32-bit float registers 1796 vldmia_f64(sp, (1 << nfloat/2) - 1); 1797 } 1798 pop(RegSet::range(r0, r3), sp); 1799 } 1800 1801 void MacroAssembler::push_CPU_state() { 1802 // if fix this, update also RegisterSaved::save_live_registers and it's map 1803 push(0x5fff, sp); // integer registers except sp & (aarch32 pc) 1804 1805 if(hasFPU()) { 1806 const int nfloat = FPUStateSizeInWords / 2; // saved by pairs 1807 vstmdb_f64(sp, 
(1 << nfloat) - 1); 1808 } else { 1809 sub(sp, sp, FPUStateSizeInWords * wordSize); 1810 } 1811 } 1812 1813 void MacroAssembler::pop_CPU_state() { 1814 if(hasFPU()) { 1815 const int nfloat = FloatRegisterImpl::number_of_registers / 2; 1816 vldmia_f64(sp, (1 << nfloat) - 1); 1817 } else { 1818 add(sp, sp, FPUStateSizeInWords * wordSize); 1819 } 1820 1821 pop(0x5fff, sp); // integer registers except sp & (aarch32 pc) 1822 } 1823 1824 // appears this needs to round up! 1825 void MacroAssembler::round_to(Register reg, int modulus) { 1826 // from x86 1827 add(reg, reg, modulus - 1); 1828 bic(reg, reg, modulus - 1); // and( reg, -modulus) 1829 } 1830 1831 SkipIfEqual::SkipIfEqual( 1832 MacroAssembler* masm, const bool* flag_addr, bool value) { 1833 _masm = masm; 1834 _masm->mov(rscratch1, ExternalAddress((address)flag_addr)); 1835 _masm->ldrb(rscratch1, rscratch1); 1836 _masm->cmp(rscratch1, 0); 1837 _masm->b(_label, value ? Assembler::NE : Assembler::EQ); 1838 } 1839 1840 SkipIfEqual::~SkipIfEqual() { 1841 _masm->bind(_label); 1842 } 1843 1844 void MacroAssembler::cmpptr(Register src1, Address src2) { 1845 mov(rscratch1, src2); 1846 ldr(rscratch1, Address(rscratch1)); 1847 cmp(src1, rscratch1); 1848 } 1849 1850 void MacroAssembler::cmpoop(Register obj1, Register obj2) { 1851 BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1852 bs->obj_equals(this, obj1, obj2); 1853 } 1854 1855 void MacroAssembler::load_klass(Register dst, Register src) { 1856 ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); 1857 } 1858 1859 // ((OopHandle)result).resolve(); 1860 void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { 1861 // OopHandle::resolve is an indirection. 
1862 access_load_word_at(T_OBJECT, IN_NATIVE, result, Address(result), tmp, noreg); 1863 } 1864 1865 void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { 1866 const int mirror_offset = in_bytes(Klass::java_mirror_offset()); 1867 ldr(dst, Address(rmethod, Method::const_offset())); 1868 ldr(dst, Address(dst, ConstMethod::constants_offset())); 1869 ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); 1870 ldr(dst, Address(dst, mirror_offset)); 1871 resolve_oop_handle(dst, tmp); 1872 } 1873 1874 void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { 1875 ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); 1876 cmp(trial_klass, tmp); 1877 } 1878 1879 void MacroAssembler::load_prototype_header(Register dst, Register src) { 1880 load_klass(dst, src); 1881 ldr(dst, Address(dst, Klass::prototype_header_offset())); 1882 } 1883 1884 void MacroAssembler::store_klass(Register dst, Register src) { 1885 str(src, Address(dst, oopDesc::klass_offset_in_bytes())); 1886 } 1887 1888 void MacroAssembler::store_klass_gap(Register dst, Register src) { } 1889 1890 void MacroAssembler::access_load_word_at(BasicType type, DecoratorSet decorators, 1891 Register dst, Address src, 1892 Register tmp1, Register thread_tmp) { 1893 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1894 decorators = AccessInternal::decorator_fixup(decorators); 1895 bool as_raw = (decorators & AS_RAW) != 0; 1896 if (as_raw) { 1897 bs->BarrierSetAssembler::load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); 1898 } else { 1899 bs->load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); 1900 } 1901 } 1902 1903 void MacroAssembler::access_store_word_at(BasicType type, DecoratorSet decorators, 1904 Address dst, Register src, 1905 Register tmp1, Register thread_tmp) { 1906 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1907 decorators = 
AccessInternal::decorator_fixup(decorators); 1908 bool as_raw = (decorators & AS_RAW) != 0; 1909 if (as_raw) { 1910 bs->BarrierSetAssembler::store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); 1911 } else { 1912 bs->store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); 1913 } 1914 } 1915 1916 void MacroAssembler::access_load_tos_at(BasicType type, DecoratorSet decorators, 1917 Address src, 1918 Register tmp1, Register thread_tmp) { 1919 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1920 decorators = AccessInternal::decorator_fixup(decorators); 1921 bool as_raw = (decorators & AS_RAW) != 0; 1922 if (as_raw) { 1923 bs->BarrierSetAssembler::load_tos_at(this, decorators, type, src, tmp1, thread_tmp); 1924 } else { 1925 bs->load_tos_at(this, decorators, type, src, tmp1, thread_tmp); 1926 } 1927 } 1928 1929 void MacroAssembler::access_store_tos_at(BasicType type, DecoratorSet decorators, 1930 Address dst, 1931 Register tmp1, Register thread_tmp) { 1932 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1933 decorators = AccessInternal::decorator_fixup(decorators); 1934 bool as_raw = (decorators & AS_RAW) != 0; 1935 if (as_raw) { 1936 bs->BarrierSetAssembler::store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); 1937 } else { 1938 bs->store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); 1939 } 1940 } 1941 1942 void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, 1943 Register thread_tmp, DecoratorSet decorators) { 1944 access_load_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 1945 } 1946 1947 void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, 1948 Register thread_tmp, DecoratorSet decorators) { 1949 access_load_word_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, dst, src, tmp1, thread_tmp); 1950 } 1951 1952 void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, 1953 
Register thread_tmp, DecoratorSet decorators) { 1954 access_store_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); 1955 } 1956 1957 // Used for storing NULLs. 1958 void MacroAssembler::store_heap_oop_null(Address dst, Register tmp) { 1959 access_store_word_at(T_OBJECT, IN_HEAP, dst, noreg, tmp, noreg); 1960 } 1961 1962 Address MacroAssembler::allocate_metadata_address(Metadata* obj) { 1963 assert(oop_recorder() != NULL, "this assembler needs a Recorder"); 1964 int index = oop_recorder()->allocate_metadata_index(obj); 1965 RelocationHolder rspec = metadata_Relocation::spec(index); 1966 return Address((address)obj, rspec); 1967 } 1968 1969 // Move an oop into a register. immediate is true if we want 1970 // immediate instrcutions, i.e. we are not going to patch this 1971 // instruction while the code is being executed by another thread. In 1972 // that case we can use move immediates rather than the constant pool. 1973 void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { 1974 int oop_index; 1975 if (obj == NULL) { 1976 oop_index = oop_recorder()->allocate_oop_index(obj); 1977 } else { 1978 #ifdef ASSERT 1979 { 1980 ThreadInVMfromUnknown tiv; 1981 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); 1982 } 1983 #endif 1984 oop_index = oop_recorder()->find_index(obj); 1985 } 1986 if (! immediate) { 1987 far_load_oop(dst, oop_index); 1988 } else { 1989 RelocationHolder rspec = oop_Relocation::spec(oop_index); 1990 mov(dst, Address((address)obj, rspec)); 1991 } 1992 } 1993 1994 // Move a metadata address into a register. 
1995 void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { 1996 int oop_index; 1997 if (obj == NULL) { 1998 oop_index = oop_recorder()->allocate_metadata_index(obj); 1999 } else { 2000 oop_index = oop_recorder()->find_index(obj); 2001 } 2002 RelocationHolder rspec = metadata_Relocation::spec(oop_index); 2003 mov(dst, Address((address)obj, rspec)); 2004 } 2005 2006 void MacroAssembler::far_load(Register dst, address addr) { 2007 address far_load_addr = pc(); 2008 add(dst, r15_pc, 0); 2009 ldr(dst, Address(dst)); 2010 2011 NativeFarLdr* far_load = (NativeFarLdr*) far_load_addr; 2012 far_load->set_data_addr((intptr_t*) addr); 2013 } 2014 2015 void MacroAssembler::far_load_oop(Register dst, int oop_index) { 2016 relocate(oop_Relocation::spec(oop_index)); 2017 // can't provide meaningful addr, give far_load addr itself 2018 far_load(dst, pc()); 2019 } 2020 2021 void MacroAssembler::far_load_metadata(Register dst, int metadata_index) { 2022 relocate(metadata_Relocation::spec(metadata_index)); 2023 // can't provide meaningful addr, give far_load addr itself 2024 far_load(dst, pc()); 2025 } 2026 2027 void MacroAssembler::far_load_const(Register dst, address const_addr) { 2028 relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS)); 2029 far_load(dst, const_addr); 2030 } 2031 2032 Address MacroAssembler::constant_oop_address(jobject obj) { 2033 #ifdef ASSERT 2034 { 2035 ThreadInVMfromUnknown tiv; 2036 assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); 2037 assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); 2038 } 2039 #endif 2040 int oop_index = oop_recorder()->find_index(obj); 2041 return Address((address)obj, oop_Relocation::spec(oop_index)); 2042 } 2043 2044 // Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
// TLAB allocation fast path; delegates to the active barrier set's
// assembler. Branches to slow_case when the TLAB cannot satisfy the
// request. Size is either var_size_in_bytes (register) or
// con_size_in_bytes (constant).
void MacroAssembler::tlab_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Register t2,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case);
}

// Defines obj, preserves var_size_in_bytes
// Eden allocation fast path via the barrier set; branches to slow_case
// on failure.
void MacroAssembler::eden_allocate(Register obj,
                                   Register var_size_in_bytes,
                                   int con_size_in_bytes,
                                   Register t1,
                                   Label& slow_case) {
  BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
  bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case);
}

// Zero words; len is in bytes
// Destroys all registers except addr
// len must be a nonzero multiple of wordSize
void MacroAssembler::zero_memory(Register addr, Register len, Register t1) {
  assert_different_registers(addr, len, t1, rscratch1, rscratch2);

#ifdef ASSERT
  { Label L;
    tst(len, BytesPerWord - 1);
    b(L, Assembler::EQ);
    stop("len is not a multiple of BytesPerWord");
    bind(L);
  }
#endif

#ifndef PRODUCT
  block_comment("zero memory");
#endif

  Label loop;
  Label entry;

  // Duff's-device style unrolled store loop: jump into the middle of the
  // unrolled body to handle len % unroll, then iterate full groups of 8.
  //
  // Algorithm:
  //
  //    scratch1 = cnt & 7;
  //    cnt -= scratch1;
  //    p += scratch1;
  //    switch (scratch1) {
  //      do {
  //        cnt -= 8;
  //          p[-8] = 0;
  //        case 7:
  //          p[-7] = 0;
  //        case 6:
  //          p[-6] = 0;
  //          // ...
  //        case 1:
  //          p[-1] = 0;
  //        case 0:
  //          p += 8;
  //      } while (cnt);
  //    }

  const int unroll = 8; // Number of str instructions we'll unroll

  lsr(len, len, LogBytesPerWord);            // bytes -> words
  andr(rscratch1, len, unroll - 1);  // tmp1 = cnt % unroll
  sub(len, len, rscratch1);          // cnt -= unroll
  // t1 always points to the end of the region we're about to zero
  add(t1, addr, rscratch1, lsl(LogBytesPerWord));
  // Computed entry: branch backwards from `entry` by one instruction
  // (4 bytes) per remainder store. Depends on each str below being
  // exactly one instruction — do not reorder or change the loop body.
  adr(rscratch2, entry);
  sub(rscratch2, rscratch2, rscratch1, lsl(2));
  mov(rscratch1, 0);                 // zero source value
  b(rscratch2);
  bind(loop);
  sub(len, len, unroll);
  for (int i = -unroll; i < 0; i++)
    str(rscratch1, Address(t1, i * wordSize));
  bind(entry);
  add(t1, t1, unroll * wordSize);
  cbnz(len, loop);
}

// Debug-only sanity check of the current thread's TLAB:
// start <= top <= end. Preserves all registers (saves the two scratch
// registers it uses on the stack).
void MacroAssembler::verify_tlab() {
#ifdef ASSERT
  if (UseTLAB && VerifyOops) {
    Label next, ok;

    strd(rscratch2, rscratch1, Address(pre(sp, -16)));

    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset())));
    cmp(rscratch2, rscratch1);
    b(next, Assembler::HS);
    STOP("assert(top >= start)");
    should_not_reach_here();

    bind(next);
    ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset())));
    ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset())));
    cmp(rscratch2, rscratch1);
    b(ok, Assembler::HS);
    STOP("assert(top <= end)");
    should_not_reach_here();

    bind(ok);
    ldrd(rscratch2, rscratch1, Address(post(sp, 16)));
  }
#endif
}

// Writes to stack successive pages until offset reached to check for
// stack overflow + shadow pages. This clobbers tmp.
2158 void MacroAssembler::bang_stack_size(Register size, Register tmp) { 2159 assert_different_registers(tmp, size, rscratch1); 2160 mov(tmp, sp); 2161 // Bang stack for total size given plus shadow page size. 2162 // Bang one page at a time because large size can bang beyond yellow and 2163 // red zones. 2164 Label loop; 2165 mov(rscratch1, os::vm_page_size()); 2166 bind(loop); 2167 lea(tmp, Address(tmp, -os::vm_page_size())); 2168 subs(size, size, rscratch1); 2169 str(size, Address(tmp)); 2170 b(loop, Assembler::GT); 2171 2172 // Bang down shadow pages too. 2173 // At this point, (tmp-0) is the last address touched, so don't 2174 // touch it again. (It was touched as (tmp-pagesize) but then tmp 2175 // was post-decremented.) Skip this address by starting at i=1, and 2176 // touch a few more pages below. N.B. It is important to touch all 2177 // the way down to and including i=StackShadowPages. 2178 for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { 2179 // this could be any sized move but this is can be a debugging crumb 2180 // so the bigger the better. 2181 lea(tmp, Address(tmp, -os::vm_page_size())); 2182 str(size, Address(tmp)); 2183 } 2184 } 2185 2186 2187 // Move the address of the polling page into dest. 2188 void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { 2189 if (SafepointMechanism::uses_thread_local_poll()) { 2190 ldr(dest, Address(rthread, Thread::polling_page_offset())); 2191 } else { 2192 mov(dest, Address(page, rtype)); 2193 } 2194 } 2195 2196 // Move the address of the polling page into r, then read the polling 2197 // page. 
// Load the polling page address into r and emit the poll load.
// Returns the address of the poll instruction (for relocation records).
address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) {
  get_polling_page(r, page, rtype);
  return read_polling_page(r, rtype);
}

// Emit the actual safepoint poll: a load through r, tagged with rtype so
// the signal handler can recognize it.
address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) {
  InstructionMark im(this);
  code_section()->relocate(inst_mark(), rtype);
  // It's ok to load to reg from reg + off (without write-back)
  ldr(r, Address(r, 0));
  return inst_mark();
}

// Helper functions for 64-bit multipliction, division and remainder
// does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
// Convenience overload: derives the high-half registers as reg+1
// (operands must be even/odd pairs).
void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) {
  Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
  Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
  Register Rmh = (Register)(Rm->encoding_nocheck() + 1);

  mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh);
}

// does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
// 64x64->64 multiply: low halves via umull, cross products folded into
// the high word (the high x high product overflows 64 bits and is dropped).
void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) {
  assert_different_registers(Rn, Rnh);
  assert_different_registers(Rm, Rmh);
  assert_different_registers(Rd, Rdh); // umull restriction
  const Register t = rscratch1;

  mul(t, Rm, Rnh);
  mla(t, Rn, Rmh, t);
  umull(Rd, Rdh, Rm, Rn);
  add(Rdh, t, Rdh);
}


// C fallback for 64-bit signed division (called from generated code).
int64_t internal_ldiv(int64_t a, int64_t b) {
  return a / b;
}

// C fallback for 64-bit signed remainder (called from generated code).
int64_t internal_lmod(int64_t a, int64_t b) {
  return a % b;
}

// Software 32-bit signed divide/modulo for CPUs without hw divide.
// res = num / den (or num % den when want_mod). Clobbers num, den,
// rscratch1, rscratch2 and r14.
void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) {
  Register cnt = rscratch1;
  Register mod = rscratch2;
  Register sign = r14;
  assert_different_registers(num, den, rscratch1, rscratch2, r14);

  // FIXME This works by first converting any negative values to positive ones, however
  // it is not possible to express |INT_MIN|. Need to fix this

  //Convert to positive values
  mov(sign, 0);

  cmp(num, 0);
  mov(sign, 1, MI);
  rsb(num, num, 0, MI);

  cmp(den, 0);
  if(!want_mod) eor(sign, sign, 1, MI);
  rsb(den, den, 0, MI);

  // Algorithm from
  // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt
  // Graeme Williams
  mov(cnt, 28);
  mov(mod, num, lsr(4));
  cmp(den, mod, lsr(12));
  sub(cnt, cnt, 16, Assembler::LE);
  mov(mod, mod, lsr(16), Assembler::LE);
  cmp(den, mod, lsr(4));
  sub(cnt, cnt, 8, Assembler::LE);
  mov(mod, mod, lsr(8), Assembler::LE);
  cmp(den, mod);
  sub(cnt, cnt, 4, Assembler::LE);
  mov(mod, mod, lsr(4), Assembler::LE);
  mov(num, num, lsl(cnt));
  rsb(den, den, 0);

  adds(num, num, num);
  //Now skip over cnt copies of the 3 instr. loop.
  // Computed branch into the unrolled divide loop below: each iteration
  // is exactly 3 instructions (12 bytes). Do not change the loop body
  // size or this arithmetic breaks.
  add(cnt, cnt, cnt, lsl(1));
  add(r15_pc, r15_pc, cnt, lsl(2));
  mov(r0, r0);   // nop; pipeline slot after the write to pc

  for(int i = 0; i < 32; i++) {
    adcs(mod, den, mod, lsl(1));
    sub(mod, mod, den, Assembler::LO);
    adcs(num, num, num);
  }

  // Re-apply the sign recorded above.
  cmp(sign, 0);
  rsb(res, want_mod? mod : num, 0, NE);
  mov(res, want_mod? mod : num, EQ);
}


// <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
// <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
// <Rd> = <Rn> / <Rm>
// <Rd> = <Rn> % <Rm>
// Signed divide/remainder for 32- or 64-bit operands; picks hw divide,
// the software routine, or a C call depending on width and CPU features.
void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) {
  //Dispatch to best possible
  Register Rdh = (Register)(Rd->encoding_nocheck() + 1);
  Register Rnh = (Register)(Rn->encoding_nocheck() + 1);
  Register Rmh = (Register)(Rm->encoding_nocheck() + 1);

  assert(32 == width || 64 == width, "Invalid width");
  bool is64b = 64 == width;

  if(is64b) {
    assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2);
  }

  if(!is64b && VM_Version::features() & FT_HW_DIVIDE) {
    // Emit a hw instruction sequence.
    if(want_remainder) {
      sdiv(rscratch1, Rn, Rm);
      mls(Rd, rscratch1, Rm, Rn);   // Rd = Rn - (Rn/Rm)*Rm
    } else {
      sdiv(Rd, Rn, Rm);
    }
  } else if(!is64b) {
    // Fall back to assembly software routine
    divide32(Rd, Rn, Rm, want_remainder);
  } else {
    // Fall back to C software routine for
    // 64 bit divide/mod
    // Marshal <Rnh:Rn> into r1:r0 and <Rmh:Rm> into r3:r2 per the AAPCS.
    // NOTE(review): the shuffle assumes Rn/Rnh don't alias r0/r1 in a
    // conflicting order when Rn != r0 — presumably guaranteed by the
    // caller's register choices; verify at call sites.
    if(Rn != r0) {
      mov(rscratch1, Rm);
      mov(rscratch2, Rmh);

      mov(r0, Rn);
      mov(r1, Rnh);

      mov(r2, rscratch1);
      mov(r3, rscratch2);
    } else if(Rm != r2) {
      mov(r2, Rm);
      mov(r3, Rmh);
    }
    address function;
    if(want_remainder) function = (address)internal_lmod;
    else               function = (address)internal_ldiv;

    mov(rscratch1, function);
    bl(rscratch1);
    if(Rd != r0) {
      mov(Rd, r0);
      if(is64b) mov(Rdh, r1);
    }
  }
}

// dest = source<lsb+width-1:lsb>, zero-extended. Chooses uxtb/uxth,
// ubfx, or a shift pair depending on alignment and CPU features.
void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) {
  assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width");
  // Dispatch to the best sequence
  if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) {
    // Can use extend X
    switch(width){
      case 8:  uxtb(dest, source, ror(lsb)); break;
      case 16: uxth(dest, source, ror(lsb)); break;
      default: break;   // width == 32 with lsb == 0: nothing to mask
    }
  } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) {
    ubfx(dest, source, lsb, width);
  } else {
    // Do two shifts
    lsl(dest, source, 32 - (width + lsb));
    lsr(dest, dest, 32 - width);
  }
}


// Atomic doubleword load into the pair <Rt2:Rt>. Uses ldrexd where the
// architecture provides it; single-core systems can use plain ldrd.
void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) {
  assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
  assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
  if(VM_Version::features() & FT_SINGLE_CORE) {
    ldrd(Rt, Rbase);
  } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
#ifdef ASSERT
    // ldrexd requires a doubleword-aligned address.
    Label lbl;
    tst(Rbase, 7);
    b(lbl, EQ);
    stop("atomic_ldrd is not doubleword aligned!");
    bind(lbl);
#endif // ASSERT

    ldrexd(Rt, Rbase);
  } else {
    // TODO: Find Java way of logging
    static bool warning_printed = false;
    if(!warning_printed) {
      fprintf(stderr, "Unable to provide atomic doubleword load.\n");
      warning_printed = true;
    }
    ldrd(Rt, Rbase);
  }
}

// Atomic doubleword store of the pair <Rt2:Rt>. Implemented as an
// ldrexd/strexd retry loop; temp/temp2 receive the discarded old value
// and the strexd status.
void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase,
                                 Register temp, Register temp2) {
  assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register");
  assert((Register) (Rt + 1) == Rt2, "Must be contiguous");
  assert((Register) (temp + 1) == temp2, "Must be contiguous");
  assert_different_registers(temp, Rt, Rbase, temp2);
  if(VM_Version::features() & FT_SINGLE_CORE) {
    strd(Rt, Rbase);
  } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) {
    // First need to gain exclusive access
    Label retry;

#ifdef ASSERT
    tst(Rbase, 7);
    b(retry, EQ);
    stop("atomic_strd is not doubleword aligned!");
#endif // ASSERT

    bind(retry);
    ldrexd(temp, Rbase);       // claim the exclusive monitor (value unused)
    strexd(temp, Rt, Rbase);   // temp := 0 on success, 1 on lost exclusivity
    cmp(temp, 0);
    b(retry, NE);
  } else {
    // TODO: Find Java way of logging
    static bool warning_printed = false;
    if(!warning_printed) {
      fprintf(stderr, "Unable to provide atomic doubleword store.\n");
      warning_printed = true;
    }
    strd(Rt, Rbase);
  }
}


// ---- Interpreter debugging scaffolding (compiled in only when
// ---- ENABLE_DEBUGGING is set; all of it is best-effort printf tracing).
#define ENABLE_DEBUGGING 0
// Helloworld is 2,482,397
uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L;

uint32_t MacroAssembler::bytecodes_executed = 0;

int MacroAssembler::enable_debug = 0;
int MacroAssembler::enable_method_debug = 0;
int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING;

// Human-readable names for all bytecodes (incl. HotSpot-internal fast
// variants); index N_J_BYTECODES-1 is the "INVALID" sentinel.
#define N_J_BYTECODES 238
const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0",
"lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w",
"iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2",
"lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2",
"aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore",
"dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0",
"fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3",
"iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1",
"dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul",
"lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg",
"ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f",
"i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg",
"dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge",
"ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn",
"lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield",
"invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray",
"anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide",
"multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield",
"fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield",
"fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield",
"fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0",
"fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch",
"fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "nofast_getfield", "nofast_putfield",
"nofast_aload_0", "nofast_iload", "INVALID"};

// Per-bytecode "seen" markers (nonzero once executed); indexed by opcode.
int bytecodes_seen[256];

// Reset the seen-bytecode table.
void MacroAssembler::init_unseen_bytecodes() {
  for(int i = 0; i < 256; i++ ) {
    bytecodes_seen[i] = 0;
  }
}

// Emit code that marks the bytecode in bc_reg as seen by storing a
// nonzero value (bc+1) into bytecodes_seen[bc]. bc_reg is restored.
void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) {
  if(ENABLE_DEBUGGING) {
    mov(scratch, (address)bytecodes_seen);
    add(scratch, scratch, bc_reg, lsl(2));
    add(bc_reg, bc_reg, 1);
    str(bc_reg, Address(scratch));
    sub(bc_reg, bc_reg, 1);
  }
}

// Print (to stdout) the names of all bytecodes never executed.
void MacroAssembler::print_unseen_bytecodes() {
  if(ENABLE_DEBUGGING) {
    printf("=== Unseen bytecodes ===\n");
    for(int i = 0; i < N_J_BYTECODES; i++) {
      if(0 == bytecodes_seen[i]) {
        printf("\t%s\n", j_bytecodes[i]);
      }
    }
    printf("=== End unseen ===\n");
  } else {
    printf("Not kept track, enable debugging to view info\n");
  }
  fflush(stdout);
}

// Register sets saved around debug calls (integer regs minus sp/pc,
// plus d0/d1).
int machine_state_regset = 0b0101111111111111;
int machine_state_float_regset = 0b11;

// Save machine state before calling out to a C debug helper.
// Must be kept in sync with restore_machine_state() and the
// sp_difference computation in reg_printf_internal().
void MacroAssembler::save_machine_state() {
  stmdb(sp, machine_state_regset);
  if(hasFPU()) {
    vstmdb_f64(sp, machine_state_float_regset);
  }
  enter();
}

// Inverse of save_machine_state().
void MacroAssembler::restore_machine_state() {
  leave();
  if(hasFPU()) {
    vldmia_f64(sp, machine_state_float_regset);
  }
  ldmia(sp, machine_state_regset);
}

// vprintf wrapper used by internal_printf (varargs trampoline).
void internal_internal_printf(const char *fmt, ...) {
  va_list args;
  va_start (args, fmt);
  vprintf (fmt, args);
  fflush(stdout);
  va_end(args);
}

// printf-style debug output with a "THREAD 0x… :" prefix prepended to
// every line of `format`. Takes exactly three u32 substitution values.
// NOTE(review): fixed 2048-byte buffers, no bounds checks — debug-only.
void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) {
  char buf[2048];
  char fmt[2048];
  buf[0] = '\0';
  const char *thread_str = "THREAD 0x%08x : ";
  int id = pthread_self();
  strcpy(fmt, format);

  char *str = strtok(fmt, "\n");
  int nreplace = 0;
  while(str) {
    strcpy(buf, thread_str);
    strcat(buf, str);
    strcat(buf, "\n");
    internal_internal_printf((const char*)buf, id, a, b, c);
    str = strtok(NULL, "\n");
  }
}

// Emit code that loads the name string of the bytecode in bc into dst;
// out-of-range opcodes map to the trailing "INVALID" entry.
void MacroAssembler::get_bytecode(Register dst, Register bc) {
  if(ENABLE_DEBUGGING) {
    int nbytecodes = N_J_BYTECODES;
    mov(dst, (address)j_bytecodes);
    cmp(bc, nbytecodes);

    ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT);
    ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE);
  }
}

int invocation_depth_count = -1; //TODO remove this with debugging info

// Per-thread shadow call stack used by the method entry/exit tracer.
#define MAX_FCALL_DEPTH 4096
struct thread_method_record{
  int thread_id;
  char names[MAX_FCALL_DEPTH][512];   // method name per frame
  int invocation_depth_count;         // index of current top frame
};
int ntmrs = 0;
#define MAX_TMRS 10
thread_method_record tmr_list[MAX_TMRS];

// Push a frame for `meth` onto the calling thread's shadow stack,
// creating the thread's record on first use. Outputs the thread id,
// new depth, and a pointer to the stored method name.
void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) {
  int id = pthread_self();
  *thread_id = id;
  for(int i = 0; i < ntmrs; i++) {
    thread_method_record *tmr = &tmr_list[i];
    if(id == tmr->thread_id) {
      // Add a new frame
      if(tmr->invocation_depth_count >= -1 &&
        tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) {
        *invocation_depth_count = ++(tmr->invocation_depth_count);
        *name = tmr->names[tmr->invocation_depth_count];
        meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512);
        return;
      } else {
        fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
        exit(1);
      }
    }
  }
  // Add a new thread
  if(ntmrs >= MAX_TMRS) {
    fprintf(stderr, "Too many tmrs\n");
    exit(1);
  }
  //Create a new tmr
  tmr_list[ntmrs].thread_id = id;
  tmr_list[ntmrs].invocation_depth_count = 0;
  meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512);
  *invocation_depth_count = 0;
  *name = tmr_list[ntmrs].names[0];
  ntmrs++;
}

// Pop a frame from the calling thread's shadow stack; depth -1 marks an
// exception-driven exit past the bottom frame.
void pop_tmr(int *thread_id, int *invocation_depth_count, char **name) {
  int id = pthread_self();
  *thread_id = id;
  for(int i = 0; i < ntmrs; i++) {
    thread_method_record *tmr = &tmr_list[i];
    if(id == tmr->thread_id) {
      if(tmr->invocation_depth_count >= 0 &&
        tmr->invocation_depth_count < MAX_FCALL_DEPTH) {
        // Pop frame
        *name = tmr->names[tmr->invocation_depth_count];
        *invocation_depth_count = (tmr->invocation_depth_count)--;
        return;
      } else if ( -1 == tmr->invocation_depth_count) {
        *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)";
        *invocation_depth_count = 0;
        return;
      } else {
        fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count);
        exit(1);
      }
    }
  }
  fprintf(stderr, "Unable to find suitable tmr\n");
  exit(1);
}

// Write "THREAD 0x… : " plus depth-proportional indentation into buf.
void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) {
  sprintf(buf, "THREAD 0x%08x : ", id);
  for(int i = 0; i < invocation_depth_count; i++) {
    strcat(buf, "  ");
  }
}


// Runtime target for print_method_entry(): records and optionally prints
// the method call.
void print_entry(Method *meth, int native) {
  char *name;
  int invocation_depth_count, id;
  push_tmr(meth, &id, &invocation_depth_count, &name);

  if(MacroAssembler::enable_method_debug) {
    char buf[4096], buf_b[2048];
    prepare_entry_exit_prefix(buf, id, invocation_depth_count);
    if(native) {
      sprintf(buf_b, "CALL NATIVE : %s\n", name);
    } else {
      sprintf(buf_b, "CALL JAVA   : %s\n", name);
    }
    strcat(buf, buf_b);
    printf("%s", buf);
    fflush(stdout);
  }
}

// Runtime target for print_method_exit(): records and optionally prints
// the method return (normal or exceptional).
void print_exit(bool normal) {
  char *name;
  int invocation_depth_count, id;
  pop_tmr(&id, &invocation_depth_count, &name);

  if(MacroAssembler::enable_method_debug) {
    char buf[4096], buf_b[2048];
    prepare_entry_exit_prefix(buf, id, invocation_depth_count);
    sprintf(buf_b, normal ? "EXIT        : %s\n" : "EXCPN EXIT  : %s\n", name);
    strcat(buf, buf_b);
    printf("%s", buf);
    fflush(stdout);
  }
}

// Emit a call to print_entry(rmethod, native), preserving machine state.
void MacroAssembler::print_method_entry(Register rmethod, bool native) {
  if(ENABLE_DEBUGGING) {
    save_machine_state();

    bic(sp, sp, 7); // 8-byte align stack
    mov(rscratch2, (address)print_entry);
    mov(r0, rmethod);
    mov(r1, native);
    bl(rscratch2);

    restore_machine_state();
  }
}

// Emit a call to print_exit(normal), preserving machine state.
void MacroAssembler::print_method_exit(bool normal) {
  if(ENABLE_DEBUGGING) {
    save_machine_state();

    bic(sp, sp, 7); // 8-byte align stack
    mov(rscratch2, (address)print_exit);
    mov(r0, normal);
    bl(rscratch2);

    restore_machine_state();
  }
}

// Emit a runtime printf of fmt with up to three register values.
// `important` output is unconditional; otherwise it is gated on the
// enable_debug flag at run time. Registers equal to sp are fixed up to
// report the caller's sp (before the save area was pushed).
void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) {
  if(ENABLE_DEBUGGING) {
    Label skip;
    save_machine_state();

    // Spill the three argument registers so later moves can't clobber
    // them before they are picked up below.
    mov(rscratch1, ra);
    str(rscratch1, Address(pre(sp, -wordSize)));
    mov(rscratch1, rb);
    str(rscratch1, Address(pre(sp, -wordSize)));
    mov(rscratch1, rc);
    str(rscratch1, Address(pre(sp, -wordSize)));

    if(!important) {
      mov(r0, (address)&enable_debug);
      ldr(r0, Address(r0));
      cmp(r0, 0);
      b(skip, Assembler::EQ);
    }

    // Distance from current sp back to the caller's sp; must mirror the
    // layout produced by save_machine_state() plus the three spills above.
    int sp_difference = wordSize * (count_bits(machine_state_regset) +
                                    2 * count_bits(machine_state_float_regset) +
                                    2 + 3); //Frame entry and saved

    mov(r0, (address)fmt);
    if(ra != sp) ldr(r1, Address(sp, 2 * wordSize));
    else         add(r1, sp, sp_difference);

    if(rb != sp) ldr(r2, Address(sp, wordSize));
    else         add(r2, sp, sp_difference);

    if(rc != sp) ldr(r3, Address(sp));
    else         add(r3, sp, sp_difference);

    bic(sp, sp, 7); // 8-byte align stack

    mov(rscratch2, (address)internal_printf);
    bl(rscratch2);

    bind(skip);
    restore_machine_state();
  }
}

// Conditional (enable_debug-gated) register printf.
void MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) {
  reg_printf_internal(false, fmt, ra, rb, rc);
}

// Unconditional register printf.
void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) {
  reg_printf_internal(true, fmt, ra, rb, rc);
}

// When debugging, set the break on bkpnt
void bkpnt() { return; }
// Emit a call to the empty bkpnt() hook so a native debugger breakpoint
// can be planted there.
void MacroAssembler::create_breakpoint() {
  if(ENABLE_DEBUGGING) {
    save_machine_state();
    bic(sp, sp, 7); // 8-byte align stack

    mov(rscratch2, (address) bkpnt);
    bl(rscratch2);

    restore_machine_state();
  }
}


// Debug helper: dump a class's constant pool to tty.
void MacroAssembler::print_cpool(InstanceKlass *klass) {
  ttyLocker ttyl;
  klass->constants()->print_on(tty);
}

// Load a doubleword into the pair Rt/Rt2. Uses a single ldrd when the
// pair is even/consecutive and the offset encodes; otherwise dispatches
// to the multi-instruction fallback. Returns 0 when a single ldrd was
// emitted, else the fallback's result.
int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) {
  if((0 == Rt->encoding_nocheck() % 2 &&
     (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) &&
     (uabs(adr.offset()) < (1 << 8))) {
    /* Good to go with a ldrd */
    ldrd(Rt, adr, cond);
    return 0x0;
  } else {
    return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm,
                                     &Assembler::ldr, Rtmp, cond);
  }
}

int
MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) { 2787 if((0 == Rt->encoding_nocheck() % 2 && 2788 (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && 2789 (uabs(adr.offset()) < (1 << 8))) { 2790 /* Good to go with a strd */ 2791 strd(Rt, adr, cond); 2792 } else { 2793 double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond); 2794 } 2795 return 0x0; 2796 } 2797 2798 int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr, 2799 void (Assembler::* mul)(unsigned, const Address&, Condition), 2800 void (Assembler::* sgl)(Register, const Address&, Condition), 2801 Register Rtmp, Condition cond) { 2802 if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && 2803 (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { 2804 /* Do a load or store multiple instruction */ 2805 (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); 2806 } else if (!adr.uses(Rt)) { 2807 double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond); 2808 } else { 2809 // need to reshuffle operation, otherwise write to Rt destroys adr 2810 if (adr.get_mode() != Address::reg) { 2811 // offset-based addressing. hence Rt2 could not be by adr 2812 if (adr.get_wb_mode() == Address::pre) { 2813 (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond); 2814 (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond); 2815 } else if (adr.get_wb_mode() == Address::post) { 2816 (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); 2817 (this->*sgl)(Rt, adr, cond); 2818 } else if (adr.get_wb_mode() == Address::off) { 2819 (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); 2820 (this->*sgl)(Rt, adr, cond); 2821 } else { 2822 ShouldNotReachHere(); 2823 } 2824 } else { 2825 // index-based addressing. 
both Rt and Rt2 could be used by adr 2826 // hence temp register is necessary 2827 adr.lea(this, Rtmp); 2828 double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond); 2829 // adr.lea have only address manipulation and cannot cause trap. 2830 // first instruction when NPE can occur is in double_ldst_failed_dispatch 2831 // so shift offset appropriately 2832 return 0x4; 2833 } 2834 } 2835 return 0x0; 2836 } 2837 2838 void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr, 2839 void (Assembler::* mul)(unsigned, const Address&, Condition), 2840 void (Assembler::* sgl)(Register, const Address&, Condition), 2841 Condition cond) { 2842 if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && 2843 (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { 2844 /* Do a store multiple instruction */ 2845 (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); 2846 } else { 2847 if (adr.get_mode() != Address::reg) { 2848 // offset-based addressing 2849 if (adr.get_wb_mode() == Address::pre) { 2850 (this->*sgl)(Rt, adr, cond); 2851 (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); 2852 } else if (adr.get_wb_mode() == Address::post) { 2853 (this->*sgl)(Rt, adr, cond); 2854 (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond); 2855 } else if (adr.get_wb_mode() == Address::off) { 2856 (this->*sgl)(Rt, adr, cond); 2857 (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); 2858 } else { 2859 ShouldNotReachHere(); 2860 } 2861 } else { 2862 // index-based addressing 2863 if (adr.get_wb_mode() == Address::pre) { 2864 // current implementation does not use Address::pre for indexed access 2865 ShouldNotReachHere(); 2866 } else if (adr.get_wb_mode() == Address::post) { 2867 // current implementation does not use Address:post for indexed access 2868 // enable the code below and implement proper post() method if it is required 2869 #if 0 2870 (this->*sgl)(Rt, Address(post(adr.base(), wordSize)), cond); 2871 
(this->*sgl)(Rt2, Address(post(adr.base(), adr.index(), adr.shift())), cond); 2872 sub(adr.base(), wordSize, cond); 2873 #endif 2874 ShouldNotReachHere(); 2875 } else if (adr.get_wb_mode() == Address::off) { 2876 (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond); 2877 (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); 2878 compensate_addr_offset(adr, cond); 2879 } else { 2880 ShouldNotReachHere(); 2881 } 2882 } 2883 } 2884 } 2885 2886 #ifdef ASSERT 2887 void MacroAssembler::verify_stack_alignment() { 2888 if (StackAlignmentInBytes > 4) { 2889 Label x; 2890 tst(sp, StackAlignmentInBytes-1); 2891 b(x, EQ); 2892 stop("stack unaligned"); 2893 bind(x); 2894 } 2895 } 2896 #endif 2897 2898 /** 2899 * Code for BigInteger::multiplyToLen() instrinsic. 2900 * 2901 * r0: x 2902 * r1: xlen 2903 * r2: y 2904 * r3: ylen 2905 * r4: z 2906 * r5: zlen 2907 * 2908 */ 2909 void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, 2910 Register z, Register zlen, 2911 Register tmp1, Register tmp2, Register tmp3, Register tmp4, 2912 Register tmp5, Register tmp6) { 2913 2914 assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 2915 2916 const Register xc = xlen; 2917 const Register yc = tmp1; 2918 const Register zc = tmp2; 2919 2920 const Register vz = tmp3; 2921 const Register carry = tmp4; 2922 const Register vx = tmp5; 2923 const Register vy = tmp6; 2924 2925 // ensure y (inner cycle) is shorter than x (outer cycle), this in theory uses CPU caches more effectively 2926 Label L_x_longer; 2927 cmp(xlen, ylen); 2928 b(L_x_longer, Assembler::GE); 2929 #define SWP(X, Y) \ 2930 mov(tmp1, Y); \ 2931 mov(Y, X); \ 2932 mov(X, tmp1) 2933 SWP(x, y); 2934 SWP(xlen, ylen); 2935 bind(L_x_longer); 2936 2937 lea(xc, Address(x, xlen, lsl(LogBytesPerInt))); // x[xstart] 2938 lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[idx] 2939 lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[kdx] 
2940 2941 // First Loop. 2942 // 2943 // final static long LONG_MASK = 0xffffffffL; 2944 // int xstart = xlen - 1; 2945 // int ystart = ylen - 1; 2946 // long carry = 0; 2947 // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { 2948 // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; 2949 // z[kdx] = (int)product; 2950 // carry = product >>> 32; 2951 // } 2952 // z[xstart] = (int)carry; 2953 // 2954 2955 ldr(vx, Assembler::pre(xc, -BytesPerInt)); 2956 mov(carry, 0); 2957 2958 Label L_loop_1; 2959 bind(L_loop_1); 2960 ldr(vy, Assembler::pre(yc, -BytesPerInt)); 2961 mov(vz, 0); 2962 umaal(vz, carry, vx, vy); 2963 str(vz, Assembler::pre(zc, -BytesPerInt)); 2964 cmp(yc, y); 2965 b(L_loop_1, Assembler::GT); 2966 2967 str(carry, Address(zc, -BytesPerInt)); 2968 2969 // Second and third (nested) loops. 2970 // 2971 // for (int i = xstart-1; i >= 0; i--) { // Second loop 2972 // carry = 0; 2973 // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop 2974 // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + 2975 // (z[k] & LONG_MASK) + carry; 2976 // z[k] = (int)product; 2977 // carry = product >>> 32; 2978 // } 2979 // z[i] = (int)carry; 2980 // } 2981 // 2982 Label L_loop_2, L_loop_3; 2983 bind(L_loop_2); 2984 2985 sub(zlen, zlen, 1); 2986 lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[jdx] 2987 lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[k] 2988 2989 ldr(vx, Assembler::pre(xc, -BytesPerInt)); 2990 mov(carry, 0); 2991 2992 bind(L_loop_3); 2993 ldr(vy, Assembler::pre(yc, -BytesPerInt)); 2994 ldr(vz, Assembler::pre(zc, -BytesPerInt)); // r1 is vz, r2 is carry 2995 umaal(vz, carry, vx, vy); 2996 str(vz, Address(zc)); 2997 cmp(yc, y); 2998 b(L_loop_3, Assembler::GT); 2999 3000 str(carry, Address(zc, -BytesPerInt)); 3001 cmp(xc, x); 3002 b(L_loop_2, Assembler::GT); 3003 } 3004 3005 /** 3006 * Code for BigInteger::mulAdd() instrinsic. 
 *
 * r0: out
 * r1: in
 * r2: offset
 * r3: len
 * r4: k
 */
void MacroAssembler::mul_add(Register out, Register in, Register offset, Register len, Register k,
                             Register tmp1, Register tmp2, Register tmp3) {

  assert_different_registers(out, in, offset, len, k, tmp1, tmp2, tmp3);

  Register vin = tmp1;
  Register vout = tmp2;
  Register carry = tmp3;
  Register result = r0; // final carry is the intrinsic's int return value

  // long kLong = k & LONG_MASK;
  // long carry = 0;
  //
  // offset = out.length-offset - 1;
  // for (int j=len-1; j >= 0; j--) {
  //   long product = (in[j] & LONG_MASK) * kLong +
  //                  (out[offset] & LONG_MASK) + carry;
  //   out[offset--] = (int)product;
  //   carry = product >>> 32;
  // }
  // return (int)carry;

  // Point both cursors one past the last element; the loop walks backwards
  // with pre-decrement addressing.
  lea(in, Address(in, len, lsl(LogBytesPerInt)));
  lea(out, Address(out, offset, lsl(LogBytesPerInt)));
  mov(carry, 0);

  Label L_loop;
  bind(L_loop);
  ldr(vin, Assembler::pre(in, -BytesPerInt));
  ldr(vout, Assembler::pre(out, -BytesPerInt));
  // umaal: vout:carry = vin*k + vout + carry
  umaal(vout, carry, vin, k);
  str(vout, Address(out));
  subs(len, len, 1);
  b(L_loop, Assembler::GT);

  mov(result, carry);
}

/**
 * Emits code to update CRC-32 with a byte value according to constants in table
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]val       Register containing the byte to fold into the CRC.
 * @param [in]table     Register containing the table of crc constants.
 *
 * uint32_t crc;
 *   val = crc_table[(val ^ crc) & 0xFF];
 *   crc = val ^ (crc >> 8);
 *
 */
void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) {
  eor(val, val, crc);
  andr(val, val, 0xff);
  ldr(val, Address(table, val, lsl(2)));
  eor(crc, val, crc, Assembler::lsr(8));
}

/**
 * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3
 *
 * @param [in,out]crc   Register containing the crc.
 * @param [in]v         Register containing the 32-bit to fold into the CRC.
 * @param [in]table0    Register containing table 0 of crc constants.
 * @param [in]table1    Register containing table 1 of crc constants.
 * @param [in]table2    Register containing table 2 of crc constants.
 * @param [in]table3    Register containing table 3 of crc constants.
 *
 * uint32_t crc;
 *   v = crc ^ v
 *   crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24]
 *
 */
void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp,
        Register tmp2, Register table0, Register table1, Register table2, Register table3) {
  // The four byte lanes of v are looked up in four separate tables and the
  // results XOR-combined; instruction order interleaves loads to hide latency.
  eor(v, crc, v);
  uxtb(tmp, v);
  uxtb(tmp2, v, ror(8));
  ldr(crc, Address(table3, tmp, lsl(2)));
  ldr(tmp2, Address(table2, tmp2, lsl(2)));
  uxtb(tmp, v, ror(16));
  eor(crc, crc, tmp2);
  uxtb(tmp2, v, ror(24));
  ldr(tmp, Address(table1, tmp, lsl(2)));
  ldr(tmp2, Address(table0, tmp2, lsl(2)));
  eor(crc, crc, tmp);
  eor(crc, crc, tmp2);
}

/**
 * @param crc   register containing existing CRC (32-bit)
 * @param buf   register pointing to input byte buffer (byte*)
 * @param len   register containing number of bytes
 * @param table register that will contain address of CRC table
 * @param tmp   scratch register
 */
// Three strategies, chosen at code-generation time:
//  1. UseCRC32: hardware crc32(c) word/byte instructions.
//  2. UseNeon (and FT_AdvSIMD): polynomial-multiply folding over 16-byte
//     chunks, falling back to the table path for alignment and tails.
//  3. Otherwise: pure table-driven word/byte loops.
// is_crc32c selects the Castagnoli polynomial tables/instructions; plain
// CRC-32 additionally pre/post-inverts crc (crc32c callers do that outside).
void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len,
        Register table0, Register table1, Register table2, Register table3,
        Register tmp, Register tmp2, Register tmp3, int is_crc32c) {
  Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit;

  if (!is_crc32c)
    inv(crc, crc);
  if (UseCRC32) {
    Label CRC_by4_loop, CRC_by1_loop;

    subs(len, len, 4);
    b(CRC_by4_loop, Assembler::GE);
    adds(len, len, 4);
    b(CRC_by1_loop, Assembler::GT);
    b(L_exit);

  BIND(CRC_by4_loop);
    ldr(tmp, Address(post(buf, 4)));
    subs(len, len, 4);
    if (!is_crc32c)
      crc32w(crc, crc, tmp);
    else // is_crc32c
      crc32cw(crc, crc, tmp);
    b(CRC_by4_loop, Assembler::GE);
    adds(len, len, 4);
    b(L_exit, Assembler::LE);
  BIND(CRC_by1_loop);
    ldrb(tmp, Address(post(buf, 1)));
    subs(len, len, 1);
    if (!is_crc32c)
      crc32b(crc, crc, tmp);
    else // is_crc32c
      crc32cb(crc, crc, tmp);
    b(CRC_by1_loop, Assembler::GT);
  BIND(L_exit);
    if (!is_crc32c)
      inv(crc, crc);
    return;
  }
  // Table-driven paths: tables 0-3 are 256-entry juint slices of one blob.
  lea(table0, ExternalAddress(
      !is_crc32c ?
      StubRoutines::crc_table_addr() :
      StubRoutines::crc32c_table_addr() ));
  add(table1, table0, 1*256*sizeof(juint));
  add(table2, table0, 2*256*sizeof(juint));
  add(table3, table0, 3*256*sizeof(juint));

  // Consume bytes one at a time until buf is word-aligned.
  BIND(L_align_by1_loop);
    tst(buf, 3);
    b(L_align_exit, Assembler::EQ);
    cmp(len, 0);
    b(L_exit, Assembler::EQ);
    sub(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    b(L_align_by1_loop);

  BIND(L_align_exit);

  if(VM_Version::features() & FT_AdvSIMD) {
    if (UseNeon) {
      cmp(len, 32+12); // account for possible need for alignment
      b(L_cpu, Assembler::LT);

      Label L_fold, L_align_by4_loop, L_align_by4_exit;

      // Word-align up to a 16-byte boundary for the vector loads.
      BIND(L_align_by4_loop);
        tst(buf, 0xf);
        b(L_align_by4_exit, Assembler::EQ);
        ldr(tmp, Address(post(buf, 4)));
        update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
        sub(len, len, 4);
        b(L_align_by4_loop);

      BIND(L_align_by4_exit);

      add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants

      // d4-d7: folding constants; q0: current 16-byte data block;
      // d16: current crc injected into the low lane.
      vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
      vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64);
      vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64);
      vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64);
      vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64);
      veor_64(d16, d16, d16);
      vmov_32(d16, 0, crc);

      veor_64(d0, d0, d16);
      sub(len, len, 32);

      // Carry-less-multiply folding: each iteration folds q0 into the next
      // 16 input bytes using polynomial multiplies (vmullp_8) plus
      // shift/XOR recombination of the partial products.
      BIND(L_fold);
        vmullp_8(q8, d0, d5);
        vmullp_8(q9, d0, d7);
        vmullp_8(q10, d0, d4);
        vmullp_8(q11, d0, d6);

        vmullp_8(q12, d1, d5);
        vmullp_8(q13, d1, d7);
        vmullp_8(q14, d1, d4);
        vmullp_8(q15, d1, d6);

        vuzp_128_16(q9, q8);
        veor_128(q8, q8, q9);

        vuzp_128_16(q13, q12);
        veor_128(q12, q12, q13);

        vshll_16u(q9, d16, 8);
        vshll_16u(q8, d17, 8);

        vshll_16u(q13, d24, 8);
        vshll_16u(q12, d25, 8);

        veor_128(q8, q8, q10);
        veor_128(q12, q12, q14);
        veor_128(q9, q9, q11);
        veor_128(q13, q13, q15);

        veor_64(d19, d19, d18);
        veor_64(d18, d27, d26);

        vshll_32u(q13, d18, 16);
        vshll_32u(q9, d19, 16);

        veor_128(q9, q8, q9);
        veor_128(q13, q12, q13);

        veor_64(d31, d26, d27);
        veor_64(d30, d18, d19);

        vshl_128_64(q15, q15, 1);
        vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128);
        veor_128(q0, q0, q15);

        subs(len, len, 16);
        b(L_fold, Assembler::GE);

      // Reduce the final 16-byte residue word-by-word through the tables.
      vmov_32(tmp, d0, 0);
      mov(crc, 0);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      vmov_32(tmp, d0, 1);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      vmov_32(tmp, d1, 0);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
      vmov_32(tmp, d1, 1);
      update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);

      add(len, len, 16); // undo the final loop decrement for the tail loops
    }
  } // if FT_AdvSIMD

  // Scalar table-driven path: 8 bytes per iteration, then byte tail.
  BIND(L_cpu);
    subs(len, len, 8);
    b(L_by8_loop, Assembler::GE);
    adds(len, len, 8);
    b(L_by1_loop, Assembler::GT);
    b(L_exit);

  BIND(L_by8_loop);
    ldr(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
    ldr(tmp, Address(post(buf, 4)));
    update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3);
    subs(len, len, 8);
    b(L_by8_loop, Assembler::GE);
    adds(len, len, 8);
    b(L_exit, Assembler::LE);
  BIND(L_by1_loop);
    subs(len, len, 1);
    ldrb(tmp, Address(post(buf, 1)));
    update_byte_crc32(crc, tmp, table0);
    b(L_by1_loop, Assembler::GT);

  BIND(L_exit);
    if (!is_crc32c)
      inv(crc, crc);
}

/**
 * First round Key (cpu implementation)
 * @param in   register containing address of input data (plain or cipher text)
 * @param key  register
 *             containing address of the key data
 * @param t0   output register t0
 * @param t1   output register t1
 * @param t2   output register t2
 * @param t3   output register t3
 * @param t4   temporary register
 * @param t5   temporary register
 * @param t6   temporary register
 * @param t7   temporary register
 */
// Emits the AES initial AddRoundKey: loads four round-key words and four
// input words (byte-reversed to host order), XORing them into t0..t3.
// Both <in> and <key> are advanced by 16 bytes via post-increment.
void MacroAssembler::kernel_aescrypt_firstRound(Register in, Register key,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6, Register t7) {

  ldr(t4, Address(post(key, 4)));
  ldr(t5, Address(post(key, 4)));
  ldr(t6, Address(post(key, 4)));
  ldr(t7, Address(post(key, 4)));
  ldr(t0, Address(post(in, 4)));
  ldr(t1, Address(post(in, 4)));
  ldr(t2, Address(post(in, 4)));
  ldr(t3, Address(post(in, 4)));
  rev(t0, t0);
  rev(t1, t1);
  rev(t2, t2);
  rev(t3, t3);
  eor(t0, t0, t4);
  eor(t1, t1, t5);
  eor(t2, t2, t6);
  eor(t3, t3, t7);
}

/**
 * AES ECB Round
 * @param table_te Register contains address of AES replacement table
 * @param key   register containing address of the key data
 * @param t0    Register for input value t0
 * @param t1    Register for input value t1
 * @param t2    Register for input value t2
 * @param t3    Register for input value t3
 * @param a     Register for output value
 * @param tmp1  Temporary register 1
 * @param tmp2  Temporary register 2
 */
// Emits one T-table round for a single output column: a = K ^ Te[t0>>24]
// ^ ror(Te[(t1>>16)&0xff],8) ^ ror(Te[(t2>>8)&0xff],16) ^ ror(Te[t3&0xff],24).
// <key> is advanced 4 bytes; callers rotate the t registers per column.
void MacroAssembler::kernel_aescrypt_round(Register table_te, Register key,
        Register t0, Register t1, Register t2, Register t3,
        Register a, Register tmp1, Register tmp2) {

  ldr(a, Address(post(key, 4))); // K
  uxtb(tmp1, t0, ror(24));
  ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T1
  uxtb(tmp2, t1, ror(16));
  eor(a, a, tmp1);
  ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T2
  uxtb(tmp1, t2, ror(8));
  eor(a, a, tmp2, ror(8));
  ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T3
  uxtb(tmp2, t3);
  eor(a, a, tmp1, ror(16));
  ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T4
  eor(a, a, tmp2, ror(24)); // a0
};

/**
 *
 * Last AES encryption round ( 4 bytes )
 * @param table_te
 * @param key
 * @param to
 * @param t0
 * @param t1
 * @param t2
 * @param t3
 * @param t4
 * @param t5
 * @param t6
 * @param t7
 *
 * int tt = K[keyOffset++];
 * out[outOffset++] = (byte)(S[(t0 >>> 24)       ] ^ (tt >>> 24));
 * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16));
 * out[outOffset++] = (byte)(S[(t2 >>>  8) & 0xFF] ^ (tt >>>  8));
 * out[outOffset++] = (byte)(S[(t3       ) & 0xFF] ^ (tt       ));
 */
// Emits the final SubBytes+AddRoundKey round for one output word and stores
// the 4 result bytes to <to> (post-incremented by 4). <key> advances 4 bytes.
// t4 accumulates the output word; t5-t7 are scratch.
void MacroAssembler::kernel_aescrypt_lastRound(
        Register table_te, Register key, Register to,
        Register t0, Register t1, Register t2, Register t3,
        Register t4, Register t5, Register t6, Register t7) {

  ldr(t7, Address(post(key, 4))); // tt

  uxtb(t5, t0, ror(24));
  ldr(t4, Address(table_te, t5, lsl(2))); // S[]
  uxtb(t6, t1, ror(16));
  eor(t4, t4, t7, lsr(24));
  ldr(t6, Address(table_te, t6, lsl(2))); // S[]
  uxtb(t5, t2, ror(8));
  eor(t6, t6, t7, lsr(16));
  uxtb(t6, t6);
  add(t4, t4, t6, lsl(8));
  ldr(t5, Address(table_te, t5, lsl(2))); // S[]
  uxtb(t6, t3);
  eor(t5, t5, t7, lsr(8));
  uxtb(t5, t5);
  add(t4, t4, t5, lsl(16));
  ldr(t6, Address(table_te, t6, lsl(2))); // S[]
  eor(t6, t6, t7);
  uxtb(t6, t6);
  add(t4, t4, t6, lsl(24));

  str(t4, Address(post(to, 4)));

}

/**
 *
 * Last AES encryption round ( 4 bytes )
 * @param table_te
 * @param key
 * @param to
 * @param t0
 * @param t1
 * @param t2
 * @param t3
 * @param t4
 * @param t5
 * @param t6
 * @param t7
 *
 * int tt = K[keyOffset++];
 * out[outOffset++] = (byte)(S[(t0 >>>
24)); 3420 * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); 3421 * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); 3422 * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); 3423 */ 3424 void MacroAssembler::kernel_aescrypt_lastRound_cbc( 3425 Register table_te, 3426 Register t0, Register t1, Register t2, Register t3, 3427 Register t4, Register t5, Register t6) { 3428 3429 uxtb(t5, t0, ror(24)); 3430 ldr(t4, Address(table_te, t5, lsl(2))); // S[] 3431 uxtb(t6, t1, ror(16)); 3432 ldr(t6, Address(table_te, t6, lsl(2))); // S[] 3433 uxtb(t5, t2, ror(8)); 3434 add(t4, t4, t6, lsl(8)); 3435 ldr(t5, Address(table_te, t5, lsl(2))); // S[] 3436 uxtb(t6, t3); 3437 add(t4, t4, t5, lsl(16)); 3438 ldr(t6, Address(table_te, t6, lsl(2))); // S[] 3439 add(t4, t4, t6, lsl(24)); 3440 } 3441 3442 /** 3443 * AES ECB encryption 3444 * 3445 * @param from register pointing to source array address 3446 * @param to register pointing to destination array address 3447 * @param key register pointing to key 3448 * @param keylen register containing key len in bytes 3449 */ 3450 void MacroAssembler::kernel_aescrypt_encryptBlock(Register from, Register to, 3451 Register key, Register keylen, Register table_te, 3452 Register t0, Register t1, Register t2, Register t3, 3453 Register t4, Register t5, Register t6, Register t7) { 3454 Label L_loop; 3455 lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); 3456 3457 ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - 3458 arrayOopDesc::base_offset_in_bytes(T_INT))); 3459 3460 3461 kernel_aescrypt_firstRound(from, key, 3462 t0, t1, t2, t3, t4, t5, t6, t7); 3463 3464 sub(keylen, keylen, 8); 3465 BIND(L_loop); 3466 3467 kernel_aescrypt_round(table_te, key, 3468 t0, t1, t2, t3, t4, t7, from); 3469 kernel_aescrypt_round(table_te, key, 3470 t1, t2, t3, t0, t5, t7, from); 3471 kernel_aescrypt_round(table_te, key, 3472 t2, t3, t0, t1, t6, t7, from); 3473 3474 uxtb(t7, t3, ror(24)); 3475 ldr(t3, 
Address(table_te, t7, lsl(2))); // T1 3476 uxtb(t7, t0, ror(16)); 3477 ldr(t7, Address(table_te, t7, lsl(2))); // T2 3478 mov(t0, t4); // t0=a0 3479 eor(t3, t3, t7, ror(8)); 3480 uxtb(t7, t1, ror(8)); 3481 ldr(t7, Address(table_te, t7, lsl(2))); // T3 3482 mov(t1, t5); // t1=a1 3483 eor(t3, t3, t7, ror(16)); 3484 uxtb(t7, t2); 3485 ldr(t7, Address(table_te, t7, lsl(2))); // T4 3486 mov(t2, t6); // t2=a2 3487 eor(t3, t3, t7, ror(24)); 3488 ldr(t7, Address(post(key, 4))); // K 3489 eor(t3, t3, t7); // t3 = a3 3490 3491 subs(keylen, keylen, 4); 3492 b(L_loop, Assembler::NE); 3493 3494 // last round is special 3495 add(table_te, table_te, 4 * 256); //S 3496 3497 kernel_aescrypt_lastRound( 3498 table_te, key, to, 3499 t0, t1, t2, t3, 3500 t4, t5, t6, t7); 3501 3502 kernel_aescrypt_lastRound( 3503 table_te, key, to, 3504 t1, t2, t3, t0, 3505 t4, t5, t6, t7); 3506 3507 kernel_aescrypt_lastRound( 3508 table_te, key, to, 3509 t2, t3, t0, t1, 3510 t4, t5, t6, t7); 3511 3512 kernel_aescrypt_lastRound( 3513 table_te, key, to, 3514 t3, t0, t1, t2, 3515 t4, t5, t6, t7); 3516 } 3517 3518 /** 3519 * AES ECB decryption 3520 * @param from register pointing to source array address 3521 * @param to register pointing to destination array address 3522 * @param key register pointing to key 3523 * @param keylen register containing key len in bytes 3524 */ 3525 void MacroAssembler::kernel_aescrypt_decryptBlock(Register from, Register to, 3526 Register key, Register keylen, Register table_te, 3527 Register t0, Register t1, Register t2, Register t3, 3528 Register t4, Register t5, Register t6, Register t7) { 3529 Label L_loop; 3530 lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); 3531 3532 ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - 3533 arrayOopDesc::base_offset_in_bytes(T_INT))); 3534 3535 push(key, sp); 3536 3537 add(key, key, 16); 3538 kernel_aescrypt_firstRound(from, key, 3539 t0, t1, t2, t3, t4, t5, t6, t7); 3540 3541 sub(keylen, keylen, 8); 3542 
BIND(L_loop); 3543 3544 kernel_aescrypt_round(table_te, key, 3545 t0, t3, t2, t1, t4, t7, from); 3546 kernel_aescrypt_round(table_te, key, 3547 t1, t0, t3, t2, t5, t7, from); 3548 kernel_aescrypt_round(table_te, key, 3549 t2, t1, t0, t3, t6, t7, from); 3550 3551 uxtb(t7, t3, ror(24)); 3552 ldr(t3, Address(table_te, t7, lsl(2))); // T1 3553 uxtb(t7, t2, ror(16)); 3554 ldr(t7, Address(table_te, t7, lsl(2))); // T2 3555 mov(t2, t6); // t2=a2 3556 eor(t3, t3, t7, ror(8)); 3557 uxtb(t7, t1, ror(8)); 3558 ldr(t7, Address(table_te, t7, lsl(2))); // T3 3559 mov(t1, t5); // t1=a1 3560 eor(t3, t3, t7, ror(16)); 3561 uxtb(t7, t0); 3562 ldr(t7, Address(table_te, t7, lsl(2))); // T4 3563 mov(t0, t4); // t0=a0 3564 eor(t3, t3, t7, ror(24)); 3565 ldr(t7, Address(post(key, 4))); // K 3566 eor(t3, t3, t7); // t3 = a3 3567 3568 subs(keylen, keylen, 4); 3569 b(L_loop, Assembler::NE); 3570 3571 pop(key, sp); 3572 // last round is special 3573 add(table_te, table_te, 4 * 256); //S 3574 3575 kernel_aescrypt_lastRound( 3576 table_te, key, to, 3577 t0, t3, t2, t1, 3578 t4, t5, t6, t7); 3579 3580 kernel_aescrypt_lastRound( 3581 table_te, key, to, 3582 t1, t0, t3, t2, 3583 t4, t5, t6, t7); 3584 3585 kernel_aescrypt_lastRound( 3586 table_te, key, to, 3587 t2, t1, t0, t3, 3588 t4, t5, t6, t7); 3589 3590 kernel_aescrypt_lastRound( 3591 table_te, key, to, 3592 t3, t2, t1, t0, 3593 t4, t5, t6, t7); 3594 } 3595 3596 /** 3597 * AES CBC encryption 3598 * 3599 * @param from register pointing to source array address 3600 * @param to register pointing to destination array address 3601 * @param key register pointing to key 3602 * @param rvec register pointing to roundkey vector 3603 * @param len register containing source len in bytes 3604 */ 3605 void MacroAssembler::kernel_aescrypt_encrypt(Register from, Register to, 3606 Register key, Register rvec, Register len, Register keylen, Register table_te, 3607 Register t0, Register t1, Register t2, Register t3, 3608 Register t4, Register t5, Register t6) { 
3609 Label L_loop, L_loop2; 3610 lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); 3611 ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - 3612 arrayOopDesc::base_offset_in_bytes(T_INT))); 3613 3614 vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 3615 vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 3616 sub(keylen, keylen, 8); 3617 3618 add(t4, key, keylen, lsl(2)); 3619 vld1_64(d8, d9, Address(t4), Assembler::ALIGN_STD); // read last key bytes to q4 3620 vrev32_128_8(q4, q4); 3621 3622 push(to, sp); 3623 BIND(L_loop2); 3624 // get round key and first round 3625 vld1_64(d0, d1, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q0 3626 veor_128(q0, q0, q2); 3627 vrev32_128_8(q0, q0); 3628 veor_128(q0, q0, q1); 3629 vmov_f64(t0, t1, d0); 3630 vmov_f64(t2, t3, d1); 3631 3632 push(RegSet::of(key, from), sp); 3633 push(RegSet::of(to, keylen), sp); 3634 3635 BIND(L_loop); 3636 3637 kernel_aescrypt_round(table_te, key, 3638 t0, t1, t2, t3, t4, to, from); 3639 kernel_aescrypt_round(table_te, key, 3640 t1, t2, t3, t0, t5, to, from); 3641 kernel_aescrypt_round(table_te, key, 3642 t2, t3, t0, t1, t6, to, from); 3643 3644 uxtb(to, t3, ror(24)); 3645 ldr(t3, Address(table_te, to, lsl(2))); // T1 3646 uxtb(to, t0, ror(16)); 3647 ldr(to, Address(table_te, to, lsl(2))); // T2 3648 mov(t0, t4); // t0=a0 3649 eor(t3, t3, to, ror(8)); 3650 uxtb(to, t1, ror(8)); 3651 ldr(to, Address(table_te, to, lsl(2))); // T3 3652 mov(t1, t5); // t1=a1 3653 eor(t3, t3, to, ror(16)); 3654 uxtb(to, t2); 3655 ldr(to, Address(table_te, to, lsl(2))); // T4 3656 mov(t2, t6); // t2=a2 3657 eor(t3, t3, to, ror(24)); 3658 ldr(to, Address(post(key, 4))); // K 3659 eor(t3, t3, to); // t3 = a3 3660 3661 subs(keylen, keylen, 4); 3662 b(L_loop, Assembler::NE); 3663 3664 // last round is special 3665 add(table_te, table_te, 4 * 256); //S 3666 kernel_aescrypt_lastRound_cbc( 3667 table_te, 3668 t0, t1, 
      // (continuation of kernel_aescrypt_encrypt: remaining arguments of the
      //  first kernel_aescrypt_lastRound_cbc call started above)
      t2, t3,
      t4, t5, t6);

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t1, t2, t3, t0,
      t5, t6, from);
  vmov_f64(d6, t4, t5);

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t2, t3, t0, t1,
      t4, t5, t6);

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t3, t0, t1, t2,
      t5, t6, from);
  vmov_f64(d7, t4, t5);
  // q2 = q4 ^ q3: this value is both stored as ciphertext (d4/d5 below) and
  // reused as the chaining value at the top of L_loop2 (veor with q0 there).
  veor_128(q2, q4, q3);

  pop(RegSet::of(to, keylen), sp);
  sub(table_te, table_te, 4 * 256); //Te
  vst1_64(d4, Address(post(to, 8)), Assembler::ALIGN_STD);
  pop(RegSet::of(key, from), sp);
  vst1_64(d5, Address(post(to, 8)), Assembler::ALIGN_STD);

  subs(len, len, 16);
  b(L_loop2, Assembler::NE);
  // all blocks processed: write the final chaining value back to rvec
  vstr_f64(d4, Address(rvec));
  vstr_f64(d5, Address(rvec, 8));
  // r0 = final 'to' minus initial 'to' (pushed at function entry),
  // i.e. the number of output bytes produced
  mov(r0, to);
  pop(to, sp);
  sub(r0, r0, to);
};

/**
 * AES CBC decryption. Emits a loop that decrypts 'len' bytes (a multiple of
 * 16) from 'from' to 'to' using the Td tables in StubRoutines, updating the
 * chaining vector in 'rvec' as it goes. The byte count produced is left in r0.
 *
 * @param from     register pointing to source array address
 * @param to       register pointing to destination array address
 * @param key      register pointing to key
 * @param rvec     register pointing to roundkey vector
 * @param len      register containing source len in bytes
 * @param keylen   scratch; loaded with the key array length (in ints)
 * @param table_te scratch; holds the address of the Td/Sd lookup tables
 * @param t0..t6   scratch registers holding the four AES state columns plus
 *                 intermediates for the round computation
 */
void MacroAssembler::kernel_aescrypt_decrypt(Register from, Register to,
    Register key, Register rvec, Register len, Register keylen, Register table_te,
    Register t0, Register t1, Register t2, Register t3,
    Register t4, Register t5, Register t6) {
  Label L_loop, L_loop2;
  lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr()));

  // keylen = length of the key array in 32-bit words (header offset math
  // converts the array length field into an element count)
  ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() -
      arrayOopDesc::base_offset_in_bytes(T_INT)));

  vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1
  vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2
  vld1_64(d10, d11, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q5
  vrev32_128_8(q1, q1);
  sub(keylen, keylen, 8); // first two round keys already consumed above

  push(to, sp); // save initial 'to' for the final byte-count computation
  BIND(L_loop2);
  // get round key and first round
  vld1_64(d8, d9, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q4

  push(RegSet::of(to, key, from, keylen), sp);
  vrev32_128_8(q0, q4);
  veor_128(q0, q0, q5); // whiten with the first round key (q5)
  vmov_f64(t0, t1, d0); // move the state into core registers t0..t3
  vmov_f64(t2, t3, d1);

  BIND(L_loop);

  // three table-driven rounds per iteration; note the reversed column
  // rotation (t0,t3,t2,t1...) compared to encryption
  kernel_aescrypt_round(table_te, key,
      t0, t3, t2, t1, t4, to, from);
  kernel_aescrypt_round(table_te, key,
      t1, t0, t3, t2, t5, to, from);
  kernel_aescrypt_round(table_te, key,
      t2, t1, t0, t3, t6, to, from);

  // fourth column computed inline ('to' is free as a scratch here)
  uxtb(to, t3, ror(24));
  ldr(t3, Address(table_te, to, lsl(2))); // T1
  uxtb(to, t2, ror(16));
  ldr(to, Address(table_te, to, lsl(2))); // T2
  mov(t2, t6); // t2=a2
  eor(t3, t3, to, ror(8));
  uxtb(to, t1, ror(8));
  ldr(to, Address(table_te, to, lsl(2))); // T3
  mov(t1, t5); // t1=a1
  eor(t3, t3, to, ror(16));
  uxtb(to, t0);
  ldr(to, Address(table_te, to, lsl(2))); // T4
  mov(t0, t4); // t0=a0
  eor(t3, t3, to, ror(24));
  ldr(to, Address(post(key, 4))); // K
  eor(t3, t3, to); // t3 = a3

  subs(keylen, keylen, 4);
  b(L_loop, Assembler::NE);

  // last round is special
  add(table_te, table_te, 4 * 256); //S

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t0, t3, t2, t1,
      t4, t5, t6);

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t1, t0, t3, t2,
      t5, t6, to);
  vmov_f64(d6, t4, t5); //q3

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t2, t3, t0, t1,
      t4, t5, t6);

  kernel_aescrypt_lastRound_cbc(
      table_te,
      t3, t2, t1, t0,
      t5, t6, to);
  vmov_f64(d7, t4, t5); //q3
  pop(RegSet::of(to, key, from, keylen), sp);
  veor_128(q3, q1, q3);
  veor_128(q3, q3, q2); // CBC: xor decrypted block with previous ciphertext/IV (q2)
  vshl_128_64(q2, q4, 0); // q2 <- q4: shift-by-0 used as a 128-bit move; the
                          // saved ciphertext becomes the next chaining value

  sub(table_te, table_te, 4 * 256); //Te

  vst1_64(d6, Address(post(to, 8)), Assembler::ALIGN_STD);
  subs(len, len, 16);
  vst1_64(d7, Address(post(to, 8)), Assembler::ALIGN_STD);

  b(L_loop2, Assembler::NE);

  // done: store final chaining value and compute byte count into r0
  vstr_f64(d4, Address(rvec));
  vstr_f64(d5, Address(rvec, 8));
  mov(r0, to);
  pop(to, sp);
  sub(r0, r0, to);
};

/*
 * First round function of SHA1 (Ch): f = (b AND (c XOR d)) XOR d.
 * If sh != 0, c is taken rotated right by (32-sh) — i.e. rotated left by sh.
 * 'tmp' is unused here; kept for signature symmetry with sha_round3.
 */
void MacroAssembler::sha_round1(Register st_b, Register st_c, Register st_d,
    Register tmp, Register st_f, int sh) {
  if (sh) {
    eor(st_f, st_d, st_c, ror(32-sh));
  } else {
    eor(st_f, st_d, st_c);
  }
  andr(st_f, st_f, st_b);
  eor(st_f, st_f, st_d);
}

/*
 * Second and fourth round function of SHA1 (Parity): f = b XOR c XOR d.
 * If sh != 0, c is taken rotated left by sh. 'tmp' is unused here.
 */
void MacroAssembler::sha_round2(Register st_b, Register st_c, Register st_d,
    Register tmp, Register st_f, int sh) {
  if (sh) {
    eor(st_f, st_b, st_c, ror(32-sh));
  } else {
    eor(st_f, st_b, st_c);
  }
  eor(st_f, st_f, st_d);
}

/*
 * Third round function of SHA1 (Maj): f = (b AND c) OR (d AND (b OR c)).
 * If sh != 0, c is taken rotated left by sh. 'tmp' is clobbered.
 */
void MacroAssembler::sha_round3(Register st_b, Register st_c, Register st_d,
    Register tmp, Register st_f, int sh) {
  if (sh) {
    andr(st_f, st_b, st_c, ror(32-sh));
    orr(tmp, st_b, st_c, ror(32-sh));
  } else {
    andr(st_f, st_b, st_c);
    orr(tmp, st_b, st_c);
  }
  andr(tmp, st_d, tmp);
  orr(st_f, st_f, tmp);
}

/*
 * Calculate Deltas w[i] and w[i+1]
 * w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) rotl 1
 * Also computes st_kw = st_k + w16 (round constant plus message words) first,
 * so callers get K+W even when 'update' is false (tail of the schedule).
 */
void MacroAssembler::sha_w0(FloatRegister w16, FloatRegister w14,
    FloatRegister w8, FloatRegister w4, FloatRegister w2,
    FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4,
    FloatRegister st_k, FloatRegister st_kw, bool update) {
  vadd_64_32(st_kw, st_k, w16);
  if(update) {
    veor_64(tmp1, w16, w14);
    vext_64(tmp2, w2, w4, 4);
    veor_64(tmp3, tmp1, w8);
    veor_64(tmp4, tmp3, tmp2);

    // rotate-left-by-1 built from shr(31) | shl(1)
    vshr_64_u32(tmp1, tmp4, 31);
    vshl_64_32(tmp2, tmp4,
        1);
    vorr_64(w16, tmp1, tmp2);
  }
}
/*
 * Calculate Deltas w[i] and w[i+1].
 * The 16-entry message-schedule window lives in eight D registers
 * (w16..w2); which physical register holds the oldest pair rotates every
 * call, so this dispatches on (counter & 7) and forwards the correctly
 * rotated register assignment to sha_w0. 'counter' is incremented here.
 */
void MacroAssembler::sha_w(FloatRegister w16, FloatRegister w14,
    FloatRegister w12, FloatRegister w10, FloatRegister w8,
    FloatRegister w6, FloatRegister w4, FloatRegister w2,
    FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4,
    FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp,
    bool update) {
  Label L_7, L_6, L_5, L_4, L_3, L_2, L_1, L_done;
  andr(rtmp, counter, 0x7);
  add(counter, counter, 1);
  cmp(rtmp, 7);
  b(L_7, Assembler::EQ);
  cmp(rtmp, 6);
  b(L_6, Assembler::EQ);
  cmp(rtmp, 5);
  b(L_5, Assembler::EQ);
  cmp(rtmp, 4);
  b(L_4, Assembler::EQ);
  cmp(rtmp, 3);
  b(L_3, Assembler::EQ);
  cmp(rtmp, 2);
  b(L_2, Assembler::EQ);
  cmp(rtmp, 1);
  b(L_1, Assembler::EQ);
  // rtmp == 0
  sha_w0(w16, w14, w8, w4, w2, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
  b(L_done);
  BIND(L_1); {
    sha_w0(w14, w12, w6, w2, w16, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
    b(L_done);
  }
  BIND(L_2); {
    sha_w0(w12, w10, w4, w16, w14, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
    b(L_done);
  }
  BIND(L_3); {
    sha_w0(w10, w8, w2, w14, w12, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
    b(L_done);
  }
  BIND(L_4); {
    sha_w0(w8, w6, w16, w12, w10, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
    b(L_done);
  }
  BIND(L_5); {
    sha_w0(w6, w4, w14, w10, w8, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
    b(L_done);
  }
  BIND(L_6); {
    sha_w0(w4, w2, w12, w8, w6, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
    b(L_done);
  }
  BIND(L_7); {
    sha_w0(w2, w16, w10, w6, w4, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update);
  }
  BIND(L_done);
}

/**
 * SHA1 digest: emits one 512-bit-block compression. Two SHA1 iterations are
 * performed per emitted loop iteration (hence 10 loop trips per 20-step
 * round); round constants come from StubRoutines::sha1_table_addr().
 *
 * @param from  register pointing to source array address
 * @param state register pointing to state array address (5 ints a..e)
 * Remaining registers are scratch: counter/counter2 drive the loops,
 * table_k addresses the K table, st_a..st_e hold the working state,
 * tmp/st_new_a/st_w hold intermediates.
 */
void MacroAssembler::kernel_sha_implCompress(Register from, Register state,
    Register counter, Register table_k,
    Register st_a, Register st_b,
    Register st_c, Register st_d, Register st_e,
    Register tmp, Register counter2, Register st_new_a, Register st_w) {
  Label L_round_1, L_round_2, L_round_3, L_round_4, L_round_4_cont, L_hash_no_w;

  // message-schedule window and scratch live in consecutive D registers
  FloatRegister w16 = d0; //q0-q7
  FloatRegister w14 = w16->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w12 = w14->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w10 = w12->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w8 = w10->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w6 = w8->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w4 = w6->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w2 = w4->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister wtmp1 = w2->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister wtmp2 = wtmp1->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister wtmp3 = wtmp2->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister wtmp4 = wtmp3->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister st_k1 = wtmp4->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister st_k2 = st_k1->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister st_k = st_k2->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister st_kw = st_k->successor(FloatRegisterImpl::DOUBLE);


  assert_different_registers(st_a,st_b,st_c,st_d,st_e,tmp,counter2, st_new_a, st_w);
  assert_different_registers(w2,w4,w6,w8,w10,w12,w14,w16);

  lea(table_k, ExternalAddress(StubRoutines::sha1_table_addr()));

  // read initial 16 W elements
  vld1_64(w16, w14, w12, w10, Address(post(from, 32)), Assembler::ALIGN_STD);
  vld1_64(w8, w6, w4, w2, Address(from), Assembler::ALIGN_STD);

  // revert W (byte-swap to big-endian word order via vrev)
  vrev64_128_8(w16, w16);
  vrev64_128_8(w12, w12);
  vrev64_128_8(w8, w8);
  vrev64_128_8(w4, w4);
  // load state
  ldr(st_a, Address(post(state, 4)));
  ldr(st_b, Address(post(state, 4)));
  ldr(st_c, Address(post(state, 4)));
  ldr(st_d, Address(post(state, 4)));
  ldr(st_e, Address(state));
  sub(state, state, 16);

  mov(counter2, 0);
  mov(counter, 10); // 10 trips x 2 SHA1 steps = 20 steps per round
  // first round
  vld1_64(st_k1, st_k2, Address(table_k), Assembler::ALIGN_128);
  vdup_64_32(st_k, st_k1, 0); // K for steps 0..19

  BIND(L_round_1); {
    sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);

    // first of the two fused steps
    sha_round1(st_b, st_c, st_d, tmp, st_new_a, 0);
    vmov_32(st_w, st_kw, 1);
    add(st_new_a, st_new_a, st_a, ror(32-5));
    add(st_new_a, st_new_a, st_e);
    add(st_new_a, st_new_a, st_w);

    // second step, using st_new_a as the new 'a'
    vmov_32(st_w, st_kw, 0);
    sha_round1(st_a, st_b, st_c, tmp, st_e, 30);

    add(tmp, st_e, st_new_a, ror(32-5));
    add(tmp, tmp, st_d);

    // rotate the state registers by two steps
    mov(st_e, st_c);
    mov(st_d, st_b, ror(32-30));
    mov(st_c, st_a, ror(32-30));
    mov(st_b, st_new_a);
    add(st_a, tmp, st_w);

    sub(counter, counter, 1);
  }cbnz(counter, L_round_1);

  mov(counter, 10);
  // second round
  vdup_64_32(st_k, st_k1, 1); // K for steps 20..39

  BIND(L_round_2); {
    sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);

    sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0);
    vmov_32(st_w, st_kw, 1);
    add(st_new_a, st_new_a, st_a, ror(32-5));
    add(st_new_a, st_new_a, st_e);
    add(st_new_a, st_new_a, st_w);

    vmov_32(st_w, st_kw, 0);
    sha_round2(st_a, st_b, st_c, tmp, st_e, 30);

    add(tmp, st_e, st_new_a, ror(32-5));
    add(tmp, tmp, st_d);

    mov(st_e, st_c);
    mov(st_d, st_b, ror(32-30));
    mov(st_c, st_a, ror(32-30));
    mov(st_b, st_new_a);
    add(st_a, tmp, st_w);

    sub(counter, counter, 1);
  }cbnz(counter, L_round_2);

  mov(counter, 10);
  vdup_64_32(st_k, st_k2, 0); // K for steps 40..59
  // third round

  BIND(L_round_3); {
    sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);

    sha_round3(st_b, st_c, st_d, tmp, st_new_a, 0);
    vmov_32(st_w, st_kw, 1);
    add(st_new_a, st_new_a, st_a, ror(32-5));
    add(st_new_a, st_new_a, st_e);
    add(st_new_a, st_new_a, st_w);

    vmov_32(st_w, st_kw, 0);
    sha_round3(st_a, st_b, st_c, tmp, st_e, 30);

    add(tmp, st_e, st_new_a, ror(32-5));
    add(tmp, tmp, st_d);

    mov(st_e, st_c);
    mov(st_d, st_b, ror(32-30));
    mov(st_c, st_a, ror(32-30));
    mov(st_b, st_new_a);
    add(st_a, tmp, st_w);

    sub(counter, counter, 1);
  }cbnz(counter, L_round_3);

  mov(counter, 10);
  // fourth round
  vdup_64_32(st_k, st_k2, 1); // K for steps 60..79

  BIND(L_round_4); {
    sub(counter, counter, 1);
    // last 16 steps need no new schedule words: skip the W update
    cmp(counter, 8);
    b(L_hash_no_w, Assembler::LO);
    sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp);
    b(L_round_4_cont);
    BIND(L_hash_no_w);
    sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp, false);
    BIND(L_round_4_cont);

    sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0);
    vmov_32(st_w, st_kw, 1);
    add(st_new_a, st_new_a, st_a, ror(32-5));
    add(st_new_a, st_new_a, st_e);
    add(st_new_a, st_new_a, st_w);

    vmov_32(st_w, st_kw, 0);
    sha_round2(st_a, st_b, st_c, tmp, st_e, 30);

    add(tmp, st_e, st_new_a, ror(32-5));
    add(tmp, tmp, st_d);

    mov(st_e, st_c);
    mov(st_d, st_b, ror(32-30));
    mov(st_c, st_a, ror(32-30));
    mov(st_b, st_new_a);
    add(st_a, tmp, st_w);

  }cbnz(counter, L_round_4);

  // load state and add the block result to it
  ldr(tmp, Address(post(state, 4)));
  add(st_a, st_a, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(st_b, st_b, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(st_c, st_c, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(st_d, st_d, tmp);
  ldr(tmp, Address(state));
  add(st_e, st_e, tmp);
  sub(state, state, 16);

  // save state
  str(st_a, Address(post(state, 4)));
  str(st_b, Address(post(state, 4)));
  str(st_c, Address(post(state, 4)));
  str(st_d, Address(post(state, 4)));
  str(st_e, Address(state));
}
/**
 * One iteration of SHA256 algorithm
 * Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22)
 * Ma := (a and b) xor (a and c) xor (b and c)
 * t2 := Σ0 + Ma
 * Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25)
 * Ch := (e and f) xor ((not e) and g)
 * t1 := h + Σ1 + Ch + k[i] + w[i]
 * h := g
 * g := f
 * f := e
 * e := d + t1
 * d := c
 * c := b
 * b := a
 * a := t1 + t2
 *
 * The rotation of the state registers is performed by the caller choosing
 * the register assignment for the next call; this routine only produces
 * Dnew_a and Dnew_e. Dkw holds k[i]+w[i]; 'index' selects the 32-bit lane.
 * Dtmp is clobbered.
 */
void MacroAssembler::sha256_implCompress_iter0(
    Register Da, Register Db, Register Dc, Register Dd,
    Register De, Register Df, Register Dg, Register Dh,
    FloatRegister Dkw, int index,
    Register Dtmp,
    Register Dnew_a, Register Dnew_e
    ) {
  assert_different_registers(Da, Db, Dc, Dd, De, Df, Dg, Dh);

  // Ma := (a and b) xor (a and c) xor (b and c)
  andr(Dnew_a, Da, Db);
  andr(Dnew_e, Da, Dc);
  eor(Dnew_a, Dnew_a, Dnew_e);
  andr(Dnew_e, Db, Dc);
  eor(Dnew_e, Dnew_a, Dnew_e); //Ma

  mov(Dnew_a, Da, ror(2));
  eor(Dnew_a, Dnew_a, Da, ror(13));
  eor(Dnew_a, Dnew_a, Da, ror(22)); //Σ0

  add(Dnew_a, Dnew_a, Dnew_e); //t2

  andr(Dnew_e, De, Df);
  mvn(Dtmp, De);
  andr(Dtmp, Dtmp, Dg);
  eor(Dtmp, Dnew_e, Dtmp); //Ch

  mov(Dnew_e, De, ror(6));
  eor(Dnew_e, Dnew_e, De, ror(11));
  eor(Dnew_e, Dnew_e, De, ror(25)); //Σ1

  add(Dnew_e, Dnew_e, Dtmp);
  vmov_32(Dtmp, Dkw, index); // Dtmp = k[i] + w[i]
  add(Dnew_e, Dnew_e, Dh);

  add(Dtmp,
      Dnew_e, Dtmp); //t1

  add(Dnew_e, Dtmp, Dd); //new_e
  add(Dnew_a, Dtmp, Dnew_a); //new_a
};
/**
 * Four iterations of SHA256 algorithm.
 * The eight state values rotate through the register set one position per
 * iteration; 'step' (0..4) records the current rotation so the right
 * assignment can be forwarded to sha256_implCompress_iter0, and is updated
 * on exit. Dkw1/Dkw2 hold four k[i]+w[i] values; tmp is clobbered.
 */
void MacroAssembler::sha256_implCompress_iter(
    Register ra, Register rb, Register rc, Register rd,
    Register re, Register rf, Register rg, Register rh,
    FloatRegister Dkw1, FloatRegister Dkw2,
    Register step,
    Register tmp,
    Register ra2, Register re2
    ) {
  Label L_4, L_3, L_2, L_1, L_done;
  cmp(step, 4);
  b(L_4, Assembler::EQ);
  cmp(step, 3);
  b(L_3, Assembler::EQ);
  cmp(step, 2);
  b(L_2, Assembler::EQ);
  cmp(step, 1);
  b(L_1, Assembler::EQ);
  // step == 0
  sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 0, tmp, ra2, re2);
  sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 1, tmp, rd, rh);
  sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 0, tmp, rc, rg);
  sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 1, tmp, rb, rf);
  mov(step, 4);
  b(L_done);
  BIND(L_1); {
    sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 0, tmp, rd, rh);
    sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 1, tmp, rc, rg);
    sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 0, tmp, rb, rf);
    sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 1, tmp, ra, re);
    mov(step, 0);
    b(L_done);
  }
  BIND(L_2); {
    sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 0, tmp, rc, rg);
    sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 1, tmp, rb, rf);
    sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 0, tmp, ra, re);
    sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 1, tmp, ra2, re2);
    mov(step, 1);
    b(L_done);
  }
  BIND(L_3); {
    sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 0, tmp, rb, rf);
    sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 1, tmp, ra, re);
    sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 0, tmp, ra2, re2);
    sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 1, tmp, rd, rh);
    mov(step, 2);
    b(L_done);
  }
  BIND(L_4); {
    sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 0, tmp, ra, re);
    sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 1, tmp, ra2, re2);
    sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 0, tmp, rd, rh);
    sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 1, tmp, rc, rg);
    mov(step, 3);
  }
  BIND(L_done);
};

/*
 * Calculate Deltas w[i] and w[i+1]
 * s0 := (w[i-15] rotr 7) xor (w[i-15] rotr 18) xor (w[i-15] shr 3)
 * s1 := (w[i-2] rotr 17) xor (w[i-2] rotr 19) xor (w[i-2] shr 10)
 * w[i] := w[i-16] + s0 + w[i-7] + s1
 * Two words are computed at once in a 128-bit register; the shift counts
 * below (e.g. 35 = 32+3, 42 = 32+10) operate on 64-bit lanes that each hold
 * one 32-bit word in the low half. Qtmp_S0/Qtmp_S1/Qtmp1 are clobbered.
 */
void MacroAssembler::sha256_w0(
    FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14,
    FloatRegister w_m7, FloatRegister w_m6,
    FloatRegister w_m2,
    FloatRegister Qtmp_S0, FloatRegister Qtmp_S1,
    FloatRegister Qtmp1){

  vmov_64(Qtmp1, w_m15);
  vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m14);
  vshr_128_u64(Qtmp_S0, Qtmp1, 7);
  vshr_128_u64(Qtmp_S1, Qtmp1, 18);
  veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1);
  vshr_128_u64(Qtmp_S1, Qtmp1, 35);
  veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); //S0

  vshr_128_u64(Qtmp_S1, w_m2, 17);
  vshr_128_u64(Qtmp1, w_m2, 19);
  veor_128(Qtmp_S1, Qtmp_S1, Qtmp1);
  vshr_128_u64(Qtmp1, w_m2, 42);
  veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); //S1

  vmov_64(Qtmp1, w_m7);
  vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m6);
  vadd_128_32(Qtmp1, Qtmp1, w_m16);
  vadd_128_32(Qtmp1, Qtmp1, Qtmp_S0);
  vadd_128_32(w_m16, Qtmp1, Qtmp_S1); // w[i/i+1]

  // broadcast the new low words so later rounds can read them per-lane
  vdup_64_32(w_m16, w_m16, 0);
  vdup_64_32(w_m15, w_m15, 0);
}

/*
 * Calculate Deltas w[i] ... w[i+3].
 * The 16-word schedule window rotates through q8-q15 four words per call;
 * dispatch on (counter & 3) picks the correctly rotated register
 * assignment. Also accumulates k+w into st_kw first, and skips the schedule
 * update for the final rounds (counter < 3).
 */
void MacroAssembler::sha256_w(FloatRegister w16, FloatRegister w14,
    FloatRegister w12, FloatRegister w10, FloatRegister w8,
    FloatRegister w6, FloatRegister w4, FloatRegister w2,
    FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3,
    FloatRegister st_kw, Register counter, Register rtmp) {
  // odd-numbered aliases are the second D half of each Q register
  FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE);
  FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE);

  FloatRegister Dtmp1 = as_FloatRegister(tmp1->encoding());
  FloatRegister Dtmp2 = Dtmp1->successor(FloatRegisterImpl::DOUBLE);
  Label L_3, L_2, L_1, L_done;

  andr(rtmp, counter, 0x3);
  cmp(rtmp, 3);
  b(L_3, Assembler::EQ);
  cmp(rtmp, 2);
  b(L_2, Assembler::EQ);
  cmp(rtmp, 1);
  b(L_1, Assembler::EQ);
  // rtmp == 0
  vext_64(Dtmp1, w16, w15, 4);
  vext_64(Dtmp2, w14, w13, 4);
  vadd_128_32(st_kw, st_kw, tmp1);
  cmp(counter, 3);
  b(L_done, Assembler::LO);
  sha256_w0(w16, w15, w14, w7, w6, w2, tmp1, tmp2, tmp3);
  sha256_w0(w14, w13, w12, w5, w4, w16, tmp1, tmp2, tmp3);
  b(L_done);
  BIND(L_3); {
    vext_64(Dtmp1, w12, w11, 4);
    vext_64(Dtmp2, w10, w9, 4);
    vadd_128_32(st_kw, st_kw, tmp1);
    cmp(counter, 3);
    b(L_done, Assembler::LO);
    sha256_w0(w12, w11, w10, w3, w2, w14, tmp1, tmp2, tmp3);
    sha256_w0(w10, w9, w8, w1, w16, w12, tmp1, tmp2, tmp3);
    b(L_done);
  }
  BIND(L_2); {
    vext_64(Dtmp1, w8, w7, 4);
    vext_64(Dtmp2, w6, w5, 4);
    vadd_128_32(st_kw, st_kw, tmp1);
    cmp(counter, 3);
    b(L_done, Assembler::LO);
    sha256_w0(w8, w7, w6, w15, w14, w10, tmp1, tmp2, tmp3);
    sha256_w0(w6, w5, w4, w13, w12, w8, tmp1, tmp2, tmp3);
    b(L_done);
  }
  BIND(L_1); {
    vext_64(Dtmp1, w4, w3, 4);
    vext_64(Dtmp2, w2, w1, 4);
    vadd_128_32(st_kw, st_kw, tmp1);
    cmp(counter, 3);
    b(L_done, Assembler::LO);
    sha256_w0(w4, w3, w2, w11, w10, w6, tmp1, tmp2, tmp3);
    sha256_w0(w2, w1, w16, w9, w8, w4, tmp1, tmp2, tmp3);
  }
  BIND(L_done);
}

/**
 * SHA256 digest: emits one 512-bit-block compression (64 steps as 16 trips
 * of 4 fused iterations). Round constants come from
 * StubRoutines::sha256_table_addr(); the message schedule lives in q8-q15.
 *
 * @param from  register pointing to source array address
 * @param state register pointing to state array address (8 ints)
 * Remaining registers are scratch working state. Note that 'from' and
 * 'state' are reused as tmp/step inside the loop (state is saved on the
 * stack first), and because the register rotation ends mid-cycle the final
 * a..h values end up in (rb, rc, rd, ra2, rf, rg, rh, re2).
 */
void MacroAssembler::kernel_sha256_implCompress(Register from, Register state,
    Register counter, Register table_k,
    Register ra, Register rb, Register rc, Register rd, Register re,
    Register rf, Register rg, Register rh,
    Register ra2, Register re2) {

  Label L_hash_loop, L_hash_loop_done, L_hash_no_w;
  lea(table_k, ExternalAddress(StubRoutines::sha256_table_addr()));

  // read next k
  vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128);
  // read initial 16 W elements in q8-q11
  vld1_64(d16, d17, d18, d19, Address(post(from, 32)), Assembler::ALIGN_STD); // read from
  vld1_64(d20, d21, d22, d23, Address(post(from, 32)), Assembler::ALIGN_STD); // read from
  // revert W (byte-swap each 32-bit word via vrev)
  vrev32_128_8(q8, q8);
  vrev32_128_8(q9, q9);
  vrev32_128_8(q10, q10);
  vrev32_128_8(q11, q11);

  vadd_128_32(q7, q7, q8); // k + w

  // spread the 16 words across d16-d31, one word duplicated per D register
  vdup_64_32(d31, d23, 1); //w1
  vdup_64_32(d30, d23, 0); //w2
  vdup_64_32(d29, d22, 1); //w3
  vdup_64_32(d28, d22, 0); //w4
  vdup_64_32(d27, d21, 1); //w5
  vdup_64_32(d26, d21, 0); //w6
  vdup_64_32(d25, d20, 1); //w7
  vdup_64_32(d24, d20, 0); //w8
  vdup_64_32(d23, d19, 1); //w9
  vdup_64_32(d22, d19, 0); //w10
  vdup_64_32(d21, d18, 1); //w11
  vdup_64_32(d20, d18, 0); //w12
  vdup_64_32(d19, d17, 1); //w13
  vdup_64_32(d18, d17, 0); //w14
  vdup_64_32(d17, d16, 1); //w15
  vdup_64_32(d16, d16, 0); //w16

  mov(counter, 16);
  // load state ('state' itself is reused below, so save it first)
  push(state, sp);
  ldr(ra, Address(post(state, 4)));
  ldr(rb, Address(post(state, 4)));
  ldr(rc, Address(post(state, 4)));
  ldr(rd, Address(post(state, 4)));
  ldr(re, Address(post(state, 4)));
  ldr(rf, Address(post(state, 4)));
  ldr(rg, Address(post(state, 4)));
  ldr(rh, Address(state));

  const Register tmp = from;
  const Register step = state;

  // calculate deltas
  sha256_w0(d16, d17, d18, d25, d26, d30, q0, q1, q2);
  sha256_w0(d18, d19, d20, d27, d28, d16, q0, q1, q2);

  mov(step, 0); // use state for internal counter
  sub(counter, counter, 1);

  // first 4 iterations outside the loop (their k+w is already in q7/d14-d15)
  sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15,
      step,
      tmp, ra2, re2);

  BIND(L_hash_loop); {
    // read next k
    vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128);
    //calculate deltas
    sha256_w(q8, q9, q10, q11, q12, q13, q14, q15,
        q0, q1, q2,
        q7,
        counter, tmp);

    //calculate state
    sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15,
        step,
        tmp, ra2, re2);
    sub(counter, counter, 1);
  } cbnz(counter, L_hash_loop);

  pop(state, sp);

  // load initial state and add to current state; after 64 steps the rotated
  // register assignment holds a..h in (rb, rc, rd, ra2, rf, rg, rh, re2)
  ldr(tmp, Address(post(state, 4)));
  add(rb, rb, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(rc, rc, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(rd, rd, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(ra2, ra2, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(rf, rf, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(rg, rg, tmp);
  ldr(tmp, Address(post(state, 4)));
  add(rh, rh, tmp);
  ldr(tmp, Address(state));
  add(re2, re2, tmp);
  sub(state, state, 28);

  // save state
  str(rb, Address(post(state, 4)));
  str(rc, Address(post(state, 4)));
  str(rd, Address(post(state, 4)));
  str(ra2, Address(post(state, 4)));
  str(rf, Address(post(state, 4)));
  str(rg, Address(post(state, 4)));
  str(rh, Address(post(state, 4)));
  str(re2, Address(post(state, 4)));
}

/**
 * SHA512 Sigma
 * Sigma(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR ROTR(x, sh3)
 * Each 64-bit rotate is built from shr | shl since NEON has no rotate.
 * Qtmp (both D halves) is clobbered.
 */
void MacroAssembler::sha512_sigma(FloatRegister x,
    FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3) {
  FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding());
  FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE);
  assert_different_registers(x, Dtmp0, Dtmp1, Dsigma);

  vshr_64_u64(Dtmp0, x, sh1);
  vshl_64_64(Dtmp1, x, 64-sh1);
  vorr_64(Dsigma, Dtmp0, Dtmp1);

  vshr_64_u64(Dtmp0, x, sh2);
  vshl_64_64(Dtmp1, x, 64-sh2);
  vorr_64(Dtmp0, Dtmp0, Dtmp1);

  veor_64(Dsigma, Dsigma, Dtmp0);

  vshr_64_u64(Dtmp0, x, sh3);
  vshl_64_64(Dtmp1, x, 64-sh3);
  vorr_64(Dtmp0, Dtmp0, Dtmp1);

  veor_64(Dsigma, Dsigma, Dtmp0);
}

/**
 * SHA512 Delta
 * Delta(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR SHR(x, sh3)
 * Like sha512_sigma but the last term is a plain shift, not a rotate.
 * Qtmp (both D halves) is clobbered.
 */
void MacroAssembler::sha512_delta(FloatRegister x,
    FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3) {
  FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding());
  FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE);
  assert_different_registers(x, Dtmp0, Dtmp1, Ddelta);

  vshr_64_u64(Dtmp0, x, sh1);
  vshl_64_64(Dtmp1, x, 64-sh1);
  vorr_64(Ddelta, Dtmp0, Dtmp1);

  vshr_64_u64(Dtmp0, x, sh2);
  vshl_64_64(Dtmp1, x, 64-sh2);
  vorr_64(Dtmp0, Dtmp0, Dtmp1);

  veor_64(Ddelta, Ddelta, Dtmp0);

vshr_64_u64(Dtmp0, x, sh3); 4517 4518 veor_64(Ddelta, Ddelta, Dtmp0); 4519 } 4520 4521 /** 4522 * SHA512 Ch 4523 * Ch(x, y, z) = (x AND y) XOR ( NOT x AND z) 4524 */ 4525 void MacroAssembler::sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z, 4526 FloatRegister Dtmp, FloatRegister Dch) { 4527 assert_different_registers(x, Dtmp, Dch); 4528 4529 vmvn_64(Dtmp, x); 4530 vand_64(Dtmp, Dtmp, z); 4531 4532 vand_64(Dch, x, y); 4533 veor_64(Dch, Dtmp, Dch); 4534 } 4535 4536 /** 4537 * SHA512 Maj 4538 * Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) 4539 */ 4540 void MacroAssembler::sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z, 4541 FloatRegister Dtmp, FloatRegister Dmaj) { 4542 assert_different_registers(x, Dtmp, Dmaj); 4543 4544 vand_64(Dmaj, x, y); 4545 vand_64(Dtmp, x, z); 4546 veor_64(Dmaj, Dmaj, Dtmp); 4547 vand_64(Dtmp, y, z); 4548 veor_64(Dmaj, Dmaj, Dtmp); 4549 } 4550 4551 /** 4552 * SHA512 digest 4553 * 4554 * @param from register pointing to source array address 4555 * @param state register pointing to state array address 4556 */ 4557 void MacroAssembler::kernel_sha512_implCompress(Register from, Register state, 4558 Register counter, Register table_k) { 4559 Label L_hash_loop, L_hash_no_w; 4560 FloatRegister st_a = d18; //q9-q12 4561 FloatRegister st_b = st_a->successor(FloatRegisterImpl::DOUBLE); 4562 FloatRegister st_c = st_b->successor(FloatRegisterImpl::DOUBLE); 4563 FloatRegister st_d = st_c->successor(FloatRegisterImpl::DOUBLE); 4564 FloatRegister st_e = st_d->successor(FloatRegisterImpl::DOUBLE); 4565 FloatRegister st_f = st_e->successor(FloatRegisterImpl::DOUBLE); 4566 FloatRegister st_g = st_f->successor(FloatRegisterImpl::DOUBLE); 4567 FloatRegister st_h = st_g->successor(FloatRegisterImpl::DOUBLE); 4568 4569 FloatRegister w16 = d0; //q0-q7 4570 FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); 4571 FloatRegister w14 = w15->successor(FloatRegisterImpl::DOUBLE); 4572 FloatRegister w13 = 
w14->successor(FloatRegisterImpl::DOUBLE); 4573 FloatRegister w12 = w13->successor(FloatRegisterImpl::DOUBLE); 4574 FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); 4575 FloatRegister w10 = w11->successor(FloatRegisterImpl::DOUBLE); 4576 FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); 4577 FloatRegister w8 = w9->successor(FloatRegisterImpl::DOUBLE); 4578 FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); 4579 FloatRegister w6 = w7->successor(FloatRegisterImpl::DOUBLE); 4580 FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); 4581 FloatRegister w4 = w5->successor(FloatRegisterImpl::DOUBLE); 4582 FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); 4583 FloatRegister w2 = w3->successor(FloatRegisterImpl::DOUBLE); 4584 FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); 4585 4586 FloatRegister t1 = d26; 4587 FloatRegister t2 = d27; 4588 FloatRegister new_a = st_h; 4589 FloatRegister new_e = st_d; 4590 FloatRegister new_new_a = st_g; 4591 FloatRegister new_new_e = st_c; 4592 4593 FloatRegister w0 = w1->successor(FloatRegisterImpl::DOUBLE); 4594 assert_different_registers(st_a,st_b,st_c,st_d,st_e,st_f,st_g,st_h); 4595 assert_different_registers(w0,w1,w2,w3,w4,w5,w6,w7); 4596 assert_different_registers(w8,w9,w10,w11,w12,w13,w14,w15,w16); 4597 4598 lea(table_k, ExternalAddress(StubRoutines::sha512_table_addr())); 4599 4600 // read initial 16 W elements 4601 vld1_64(w16, w15, w14, w13, Address(post(from, 32)), Assembler::ALIGN_STD); 4602 vld1_64(w12, w11, w10, w9, Address(post(from, 32)), Assembler::ALIGN_STD); 4603 vld1_64(w8, w7, w6, w5, Address(post(from, 32)), Assembler::ALIGN_STD); 4604 vld1_64(w4, w3, w2, w1, Address(from), Assembler::ALIGN_STD); 4605 // read initial state to a,b,c,d,e,f,g,h 4606 vld1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); 4607 vld1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); 4608 sub(state, state, 32); 4609 4610 // revert W 4611 
vrev64_128_8(w16, w16); 4612 vrev64_128_8(w14, w14); 4613 vrev64_128_8(w12, w12); 4614 vrev64_128_8(w10, w10); 4615 vrev64_128_8(w8, w8); 4616 vrev64_128_8(w6, w6); 4617 vrev64_128_8(w4, w4); 4618 vrev64_128_8(w2, w2); 4619 4620 4621 mov(counter, 40); 4622 BIND(L_hash_loop); { 4623 sub(counter, counter, 1); 4624 // first iteration 4625 // calculate T1 4626 // read K 4627 vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); 4628 vadd_64_64(d31, st_h, w16); 4629 sha512_ch(st_e, st_f, st_g, t2, t1); 4630 sha512_sigma(st_e, q14, t2, 14, 18, 41); 4631 vadd_128_64(q13, q13, q15); 4632 vadd_64_64(t1, t1, t2); 4633 4634 // calculate T2 4635 sha512_maj(st_a, st_b, st_c, d30, d31); 4636 sha512_sigma(st_a, q14, t2, 28, 34, 39); 4637 vadd_64_64(t2, t2, d31); 4638 4639 vadd_64_64(new_a, t1, t2); 4640 vadd_64_64(new_e, st_d, t1); 4641 4642 // second iteration 4643 // calculate T1 4644 // read K 4645 vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); 4646 vadd_64_64(d31, st_g, w15); 4647 sha512_ch(new_e, st_e, st_f, t2, t1); 4648 sha512_sigma(new_e, q14, t2, 14, 18, 41); 4649 vadd_128_64(q13, q13, q15); 4650 vadd_64_64(t1, t1, t2); 4651 4652 // calculate T2 4653 sha512_maj(new_a, st_a, st_b, d30, d31); 4654 sha512_sigma(new_a, q14, t2, 28, 34, 39); 4655 vadd_64_64(t2, t2, d31); 4656 4657 vadd_64_64(new_new_a, t1, t2); 4658 vadd_64_64(new_new_e, st_c, t1); 4659 4660 // restore a,b,c,d,e,f,g,h sequence 4661 vswp_128(st_g, st_a); 4662 vswp_128(st_g, st_c); 4663 vswp_128(st_g, st_e); 4664 4665 cmp(counter, 8); 4666 b(L_hash_no_w, Assembler::LO); 4667 4668 // calculate W[+1], W[+2] 4669 sha512_delta(w15, q14, t1, 1, 8, 7); 4670 sha512_delta(w2, q14, d30, 19, 61, 6); 4671 sha512_delta(w14, q14, t2, 1, 8, 7); 4672 sha512_delta(w1, q14, d31, 19, 61, 6); 4673 4674 vadd_128_64(w16, w16, t1); 4675 vadd_128_64(w16, w16, q15); 4676 vadd_64_64(w16, w16, w7); 4677 vadd_64_64(w15, w15, w6); 4678 4679 BIND(L_hash_no_w); 4680 4681 vswp_128(w16, w14); 4682 vswp_128(w14, w12); 
vswp_128(w12, w10);
    vswp_128(w10, w8);
    vswp_128(w8, w6);
    vswp_128(w6, w4);
    vswp_128(w4, w2);
  } cbnz(counter, L_hash_loop);
  // read initial state to w16 - w9 (W regs are free now; reuse them as temps)
  vld1_64(w16, w15, w14, w13, Address(post(state, 32)), Assembler::ALIGN_STD);
  vld1_64(w12, w11, w10, w9, Address(state), Assembler::ALIGN_STD);
  sub(state, state, 32);

  // update state: add the original a..h back into the working variables
  vadd_128_64(st_a, st_a, w16);
  vadd_128_64(st_c, st_c, w14);
  vadd_128_64(st_e, st_e, w12);
  vadd_128_64(st_g, st_g, w10);

  // store state
  vst1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD);
  vst1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD);
}

// Emit code equivalent to BFC: clear bits [lsb, lsb+width) of Rd.
// Two fast paths use a shift-left/shift-right (or right/left) pair when the
// field to clear touches one end of the register; otherwise the field is
// cleared with a sequence of BIC-immediate instructions. An A32 modified
// immediate is an 8-bit value rotated right by an EVEN amount, so a chunk
// starting at an odd bit position can cover at most 7 bits; after the first
// chunk 'lsb' is always even and full 8-bit chunks are valid.
// NOTE(review): the 'cond' parameter is not applied to any of the emitted
// instructions (lsr/lsl/bic are emitted unconditionally) — confirm all
// callers pass Condition AL, or thread 'cond' through.
void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) {
  if (width > 15 && lsb == 0) {
    // Field at the bottom: shift out and back.
    lsr(Rd, Rd, width);
    lsl(Rd, Rd, width);
  } else if (width > 15 && lsb + width == 32) {
    // Field at the top: shift out and back the other way.
    lsl(Rd, Rd, 32 - lsb);
    lsr(Rd, Rd, 32 - lsb);
  } else {
    const int lsb1 = (lsb & 1);
    // First chunk: at most 8 bits, one fewer if lsb is odd (even-rotation rule).
    int w1 = width <= 8 - lsb1 ? width : 8 - lsb1;
    while (width) {
      bic(Rd, Rd, ((1 << w1) - 1) << lsb);
      width -= w1;
      lsb += w1;
      w1 = width > 8 ? 8 : width;
    }
  }
}

// get_thread can be called anywhere inside generated code so we need
// to save whatever non-callee save context might get clobbered by the
// call to the C thread_local lookup call or, indeed, the call setup
// code. x86 appears to save C arg registers.

void MacroAssembler::get_thread(Register dst) {
  // call pthread_getspecific
  // void * pthread_getspecific(pthread_key_t key);

  // Save all call-clobbered regs except dst, plus rscratch1 and rscratch2.
RegSet saved_regs = RegSet::range(r0, r3) + rscratch1 + rscratch2 + lr - dst;
  push(saved_regs, sp);

  // Align stack and save value for return.
  // AAPCS requires 8-byte stack alignment at a public call; keep the old sp
  // on the stack so it can be restored after the call.
  mov(c_rarg1, sp);
  sub(sp, sp, wordSize);
  bic(sp, sp, 7);
  str(c_rarg1, Address(sp));

  mov(rscratch2, CAST_FROM_FN_PTR(address, Thread::current));

  bl(rscratch2);
  // undo alignment
  ldr(sp, Address(sp));

  // Thread::current's result is in c_rarg0 (r0); move it if needed.
  if (dst != c_rarg0) {
    mov(dst, c_rarg0);
  }

  // restore pushed registers
  pop(saved_regs, sp);
}

#ifdef COMPILER2
// 24-bit word range == 26-bit byte range
// Returns true iff 'offset' survives a round trip through the B/BL
// 24-bit signed word-offset field unchanged.
bool check26(int offset) {
  // this could be simplified, but it mimics encoding and decoding
  // an actual branch instruction
  int off1 = offset << 6 >> 8;     // byte offset -> sign-extended word offset
  int encoded = off1 & ((1<<24)-1);
  int decoded = encoded << 8 >> 6; // word offset -> sign-extended byte offset
  return offset == decoded;
}

// Perform some slight adjustments so the default 32MB code cache
// is fully reachable.
static inline address first_cache_address() {
  return CodeCache::low_bound() + sizeof(HeapBlock::Header);
}
static inline address last_cache_address() {
  return CodeCache::high_bound() - NativeInstruction::arm_insn_sz;
}

// Can we reach target using unconditional branch or call from anywhere
// in the code cache (because code can be relocated)?
bool MacroAssembler::_reachable_from_cache(address target) {
#ifdef __thumb__
  if ((1 & (intptr_t)target) != 0) {
    // Return false to avoid 'b' if we need switching to THUMB mode.
    return false;
  }
#endif

  address cl = first_cache_address();
  address ch = last_cache_address();

  if (ForceUnreachable) {
    // Only addresses from CodeCache can be treated as reachable.
if (target < CodeCache::low_bound() || CodeCache::high_bound() <= target) {
      return false;
    }
  }

  // The target must be branch-reachable from BOTH ends of the code cache,
  // since the emitting code may later be relocated anywhere within it.
  intptr_t loffset = (intptr_t)target - (intptr_t)cl;
  intptr_t hoffset = (intptr_t)target - (intptr_t)ch;

  // -8 accounts for the ARM PC reading as "current instruction + 8".
  return check26(loffset - 8) && check26(hoffset - 8);
}

// True iff every address in the code cache can branch to every other.
bool MacroAssembler::_cache_fully_reachable() {
  address cl = first_cache_address();
  address ch = last_cache_address();
  return _reachable_from_cache(cl) && _reachable_from_cache(ch);
}

bool MacroAssembler::reachable_from_cache(address target) {
  // Only meaningful when the code being emitted lives in the code cache.
  assert(CodeCache::contains(pc()), "not supported");
  return _reachable_from_cache(target);
}

bool MacroAssembler::cache_fully_reachable() {
  return _cache_fully_reachable();
}

// IMPORTANT: does not generate mt-safe patchable code
// Emits a direct BL when the target is branch-reachable, otherwise
// materializes the target in lr and branches through it. Clobbers lr
// in the far case (harmless: BL overwrites lr anyway).
void MacroAssembler::call(address target, RelocationHolder rspec, Condition cond) {
  Register scratch = lr;
  assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported");
  if (reachable_from_cache(target)) {
    relocate(rspec);
    bl(target, cond);
    return;
  }

  mov(scratch, (intptr_t)target, cond);
  bl(scratch, cond);
}

// IMPORTANT: does not generate mt-safe patchable code.
C2 only uses this method
// for calls into runtime which do not need mt-safe patching
void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch, Condition cond) {
  assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported");
  if (reachable_from_cache(target)) {
    relocate(rtype);
    b(target, cond);
    return;
  }

  // Far target: materialize the address in the caller-supplied scratch
  // register and do an indirect branch.
  mov(scratch, (intptr_t)target, cond);
  b(scratch, cond);
}

void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) {
  // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM
  if (UseStackBanging) {
    const int page_size = os::vm_page_size();

    // Touch one byte past the shadow zone, then bang downwards in steps of
    // 0xff0 (largest pre-indexed offset step used here; < page_size, so no
    // page is skipped) until the whole prospective frame is covered.
    sub(tmp, sp, StackShadowPages*page_size);
    strb(r0, Address(tmp));
    for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) {
      strb(r0, pre(tmp, -0xff0));
    }
  }
}

// Materialize the result of a preceding FP compare (FPSCR NZCV) as an
// integer in dst: -1 for "less than" (and for unordered), 0 for "equal",
// +1 for "greater than" — derived from the A32 vcmp flag outcomes
// (LT: N; EQ: Z,C; GT: C; unordered: C,V).
void MacroAssembler::floating_cmp(Register dst) {
  vmrs(dst);                      // dst = FPSCR (N=b31, Z=b30, C=b29, V=b28)
  orr(dst, dst, 0x08000000);      // set bit 27 as a pivot for the eor below
  eor(dst, dst, dst, lsl(3));     // fold flags so bits 31:30 encode the result
  mov(dst, dst, asr(30));         // sign-extend bits 31:30 to -1 / 0 / +1
}

// Inline fast-path monitor enter for C2.
//   Roop     - object being locked
//   Rbox     - on-stack BasicLock box
//   Rmark    - scratch, receives the object's mark word
//   Rscratch, Rscratch2 - scratch registers
// On exit the flags indicate success (EQ) for the caller's slow-path branch.
void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2) {
  assert(Roop != Rscratch, "");
  assert(Roop != Rmark, "");
  assert(Rbox != Rscratch, "");
  assert(Rbox != Rmark, "");

  Label fast_lock, done;

  if (UseBiasedLocking && !UseOptoBiasInlining) {
    Label failed;
    // On bias success, jumps to 'done'; on failure falls through to CAS path.
    biased_locking_enter(Roop, Rmark, Rscratch, Rscratch2, false, done, &failed);
    bind(failed);
  }

  ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes()));
  tst(Rmark, markOopDesc::unlocked_value);
  b(fast_lock, Assembler::NE);

  // Check for recursive lock
  // See comments in InterpreterMacroAssembler::lock_object for
  // explanations on the fast recursive locking check.
// -1- test low 2 bits
  movs(Rscratch, Rmark, lsl(30));
  // -2- test (hdr - SP) if the low two bits are 0
  sub(Rscratch, Rmark, sp, Assembler::EQ);
  movs(Rscratch, Rscratch, lsr(exact_log2(os::vm_page_size())), Assembler::EQ);
  // If still 'eq' then recursive locking OK
  // set to zero if recursive lock, set to non zero otherwise (see discussion in JDK-8153107)
  str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
  b(done);

  bind(fast_lock);
  // Unlocked object: save the current (unlocked) mark word into the box,
  // then try to CAS a pointer to the box into the object's mark word.
  str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));

  membar(StoreStore);
  ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes()));
  cmp(Rscratch, Rmark);
  strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ);
  // A failed strex (Rscratch != 0) leaves NE, sending the caller to the slow
  // path; no retry loop is needed here.
  cmp(Rscratch, 0, Assembler::EQ);
  membar(AnyAny);

  bind(done);
}

// Inline fast-path monitor exit for C2, the mirror of fast_lock above.
// On exit the flags indicate success (EQ) for the caller's slow-path branch.
void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2) {
  Register Rmark = Rscratch2;

  assert(Roop != Rscratch, "");
  assert(Roop != Rmark, "");
  assert(Rbox != Rscratch, "");
  assert(Rbox != Rmark, "");

  Label done;

  if (UseBiasedLocking && !UseOptoBiasInlining) {
    biased_locking_exit(Roop, Rscratch, done);
  }

  ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
  // If hdr is NULL, we've got recursive locking and there's nothing more to do
  cmp(Rmark, 0);
  b(done, Assembler::EQ);

  // Restore the object header: CAS the displaced mark word back in, but only
  // if the mark word still points to our box (i.e. we still own the lock).
  membar(AnyAny);
  ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes()));
  cmp(Rscratch, Rmark);
  strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ);
  cmp(Rscratch, 0, Assembler::EQ);

  membar(StoreLoad);

  bind(done);
}

#endif