1 /*
   2  * Copyright (c) 2008, 2017, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "asm/assembler.inline.hpp"
  28 #include "asm/macroAssembler.hpp"
  29 #include "ci/ciEnv.hpp"
  30 #include "code/nativeInst.hpp"
  31 #include "compiler/disassembler.hpp"
  32 #include "gc/shared/cardTableModRefBS.hpp"
  33 #include "gc/shared/collectedHeap.inline.hpp"
  34 #include "interpreter/interpreter.hpp"
  35 #include "memory/resourceArea.hpp"
  36 #include "oops/klass.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/biasedLocking.hpp"
  39 #include "runtime/interfaceSupport.hpp"
  40 #include "runtime/objectMonitor.hpp"
  41 #include "runtime/os.hpp"
  42 #include "runtime/sharedRuntime.hpp"
  43 #include "runtime/stubRoutines.hpp"
  44 #include "utilities/macros.hpp"
  45 #if INCLUDE_ALL_GCS
  46 #include "gc/g1/g1CollectedHeap.inline.hpp"
  47 #include "gc/g1/g1SATBCardTableModRefBS.hpp"
  48 #include "gc/g1/heapRegion.hpp"
  49 #endif
  50 
  51 // Implementation of AddressLiteral
  52 
  53 void AddressLiteral::set_rspec(relocInfo::relocType rtype) {
  54   switch (rtype) {
  55   case relocInfo::oop_type:
  56     // Oops are a special case. Normally they would be their own section
  57     // but in cases like icBuffer they are literals in the code stream that
  58     // we don't have a section for. We use none so that we get a literal address
  59     // which is always patchable.
  60     break;
  61   case relocInfo::external_word_type:
  62     _rspec = external_word_Relocation::spec(_target);
  63     break;
  64   case relocInfo::internal_word_type:
  65     _rspec = internal_word_Relocation::spec(_target);
  66     break;
  67   case relocInfo::opt_virtual_call_type:
  68     _rspec = opt_virtual_call_Relocation::spec();
  69     break;
  70   case relocInfo::static_call_type:
  71     _rspec = static_call_Relocation::spec();
  72     break;
  73   case relocInfo::runtime_call_type:
  74     _rspec = runtime_call_Relocation::spec();
  75     break;
  76   case relocInfo::poll_type:
  77   case relocInfo::poll_return_type:
  78     _rspec = Relocation::spec_simple(rtype);
  79     break;
  80   case relocInfo::none:
  81     break;
  82   default:
  83     ShouldNotReachHere();
  84     break;
  85   }
  86 }
  87 
  88 // Initially added to the Assembler interface as a pure virtual:
  89 //   RegisterConstant delayed_value(..)
  90 // for:
  91 //   6812678 macro assembler needs delayed binding of a few constants (for 6655638)
  92 // this was subsequently modified to its present name and return type
  93 RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
  94                                                       Register tmp,
  95                                                       int offset) {
  96   ShouldNotReachHere();
  97   return RegisterOrConstant(-1);
  98 }
  99 
 100 
 101 #ifdef AARCH64
 102 // Note: the ARM32 version is OS-dependent
 103 void MacroAssembler::breakpoint(AsmCondition cond) {
 104   if (cond == al) {
 105     brk();
 106   } else {
 107     Label L;
 108     b(L, inverse(cond));
 109     brk();
 110     bind(L);
 111   }
 112 }
 113 #endif // AARCH64
 114 
 115 
 116 // virtual method calling
 117 void MacroAssembler::lookup_virtual_method(Register recv_klass,
 118                                            Register vtable_index,
 119                                            Register method_result) {
 120   const int base_offset = in_bytes(Klass::vtable_start_offset()) + vtableEntry::method_offset_in_bytes();
 121   assert(vtableEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
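       // The add below clobbers recv_klass: it advances it by vtable_index words (one word
       // per vtable entry, see the assert above), so the selected Method* can then be
       // loaded at base_offset from the adjusted pointer.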
 122   add(recv_klass, recv_klass, AsmOperand(vtable_index, lsl, LogBytesPerWord));
 123   ldr(method_result, Address(recv_klass, base_offset));
 124 }
 125 
 126 
 127 // Simplified, combined version, good for typical uses.
 128 // Falls through on failure.
 129 void MacroAssembler::check_klass_subtype(Register sub_klass,
 130                                          Register super_klass,
 131                                          Register temp_reg,
 132                                          Register temp_reg2,
 133                                          Register temp_reg3,
 134                                          Label& L_success) {
 135   Label L_failure;
 136   check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, temp_reg2, &L_success, &L_failure, NULL);
 137   check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, temp_reg2, temp_reg3, &L_success, NULL);
 138   bind(L_failure);
 139 }
 140 
 141 void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass,
 142                                                    Register super_klass,
 143                                                    Register temp_reg,
 144                                                    Register temp_reg2,
 145                                                    Label* L_success,
 146                                                    Label* L_failure,
 147                                                    Label* L_slow_path) {
 148 
 149   assert_different_registers(sub_klass, super_klass, temp_reg, temp_reg2, noreg);
 150   const Register super_check_offset = temp_reg2;
 151 
 152   Label L_fallthrough;
 153   int label_nulls = 0;
 154   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
 155   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
 156   if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; }
 157   assert(label_nulls <= 1, "at most one NULL in the batch");
 158 
 159   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 160   int sco_offset = in_bytes(Klass::super_check_offset_offset());
 161   Address super_check_offset_addr(super_klass, sco_offset);
 162 
 163   // If the pointers are equal, we are done (e.g., String[] elements).
 164   // This self-check enables sharing of secondary supertype arrays among
 165   // non-primary types such as array-of-interface.  Otherwise, each such
 166   // type would need its own customized SSA.
 167   // We move this check to the front of the fast path because many
 168   // type checks are in fact trivially successful in this manner,
 169   // so we get a nicely predicted branch right at the start of the check.
 170   cmp(sub_klass, super_klass);
 171   b(*L_success, eq);
 172 
 173   // Check the supertype display:
 174   ldr_u32(super_check_offset, super_check_offset_addr);
 175 
 176   Address super_check_addr(sub_klass, super_check_offset);
 177   ldr(temp_reg, super_check_addr); // load displayed supertype
 178   cmp(super_klass, temp_reg);
 179 
 180   // This check has worked decisively for primary supers.
 181   // Secondary supers are sought in the super_cache ('super_cache_addr').
 182   // (Secondary supers are interfaces and very deeply nested subtypes.)
 183   // This works in the same check above because of a tricky aliasing
 184   // between the super_cache and the primary super display elements.
 185   // (The 'super_check_addr' can address either, as the case requires.)
 186   // Note that the cache is updated below if it does not help us find
 187   // what we need immediately.
 188   // So if it was a primary super, we can just fail immediately.
 189   // Otherwise, it's the slow path for us (no success at this point).
 190 
 191   b(*L_success, eq);
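       // The display check above missed. If the slot probed was the secondary-super cache
       // (super_check_offset == sc_offset), the miss is not conclusive and the slow path
       // must search the secondary supers; otherwise it is a definitive failure.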
 192   cmp_32(super_check_offset, sc_offset);
 193   if (L_failure == &L_fallthrough) {
 194     b(*L_slow_path, eq);
 195   } else {
 196     b(*L_failure, ne);
 197     if (L_slow_path != &L_fallthrough) {
 198       b(*L_slow_path);
 199     }
 200   }
 201 
 202   bind(L_fallthrough);
 203 }
 204 
 205 
 206 void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass,
 207                                                    Register super_klass,
 208                                                    Register temp_reg,
 209                                                    Register temp2_reg,
 210                                                    Register temp3_reg,
 211                                                    Label* L_success,
 212                                                    Label* L_failure,
 213                                                    bool set_cond_codes) {
 214 #ifdef AARCH64
 215   NOT_IMPLEMENTED();
 216 #else
 217   // Note: if used by code that expects a register to be 0 on success,
 218   // this register must be temp_reg and set_cond_codes must be true
 219 
 220   Register saved_reg = noreg;
 221 
 222   // get additional tmp registers
 223   if (temp3_reg == noreg) {
 224     saved_reg = temp3_reg = LR;
 225     push(saved_reg);
 226   }
 227 
 228   assert(temp2_reg != noreg, "need all the temporary registers");
 229   assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, temp3_reg);
 230 
 231   Register cmp_temp = temp_reg;
 232   Register scan_temp = temp3_reg;
 233   Register count_temp = temp2_reg;
 234 
 235   Label L_fallthrough;
 236   int label_nulls = 0;
 237   if (L_success == NULL)   { L_success   = &L_fallthrough; label_nulls++; }
 238   if (L_failure == NULL)   { L_failure   = &L_fallthrough; label_nulls++; }
 239   assert(label_nulls <= 1, "at most one NULL in the batch");
 240 
 241   // a couple of useful fields in sub_klass:
 242   int ss_offset = in_bytes(Klass::secondary_supers_offset());
 243   int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 244   Address secondary_supers_addr(sub_klass, ss_offset);
 245   Address super_cache_addr(     sub_klass, sc_offset);
 246 
 247 #ifndef PRODUCT
 248   inc_counter((address)&SharedRuntime::_partial_subtype_ctr, scan_temp, count_temp);
 249 #endif
 250 
 251   // We will consult the secondary-super array.
 252   ldr(scan_temp, Address(sub_klass, ss_offset));
 253 
 254   assert(! UseCompressedOops, "search_key must be the compressed super_klass");
 255   // else (no compressed oops here) search_key is simply the uncompressed super_klass
 256   Register search_key = super_klass;
 257 
 258   // Load the array length.
 259   ldr(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
 260   add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
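       // scan_temp now points at the first element of the secondary supers array;
       // count_temp is incremented below so the subs at the loop head can both decrement
       // and test for exhaustion in one instruction.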
 261 
 262   add(count_temp, count_temp, 1);
 263 
 264   Label L_loop, L_setnz_and_fail, L_fail;
 265 
 266   // Top of search loop
 267   bind(L_loop);
 268   // Notes:
 269   //  scan_temp starts at the array elements
 270   //  count_temp is 1+size
 271   subs(count_temp, count_temp, 1);
 272   if ((L_failure != &L_fallthrough) && (! set_cond_codes) && (saved_reg == noreg)) {
 273     // direct jump to L_failure if failed and no cleanup needed
 274     b(*L_failure, eq); // not found and no cleanup needed
 275   } else {
 276     b(L_fail, eq); // not found in the array
 277   }
 278 
 279   // Load next super to check
 280   // In the array of super classes, elements are pointer-sized.
 281   int element_size = wordSize;
 282   ldr(cmp_temp, Address(scan_temp, element_size, post_indexed));
 283 
 284   // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
 285   subs(cmp_temp, cmp_temp, search_key);
 286 
 287   // A miss means we are NOT a subtype and need to keep looping
 288   b(L_loop, ne);
 289 
 290   // Falling out the bottom means we found a hit; we ARE a subtype
 291 
 292   // Note: temp_reg/cmp_temp is already 0 and flag Z is set
 293 
 294   // Success.  Cache the super we found and proceed in triumph.
 295   str(super_klass, Address(sub_klass, sc_offset));
 296 
 297   if (saved_reg != noreg) {
 298     // Return success
 299     pop(saved_reg);
 300   }
 301 
 302   b(*L_success);
 303 
 304   bind(L_fail);
 305   // Note1: check "b(*L_failure, eq)" above if adding extra instructions here
 306   if (set_cond_codes) {
 307     movs(temp_reg, sub_klass); // clears Z and sets temp_reg to non-0 if needed
 308   }
 309   if (saved_reg != noreg) {
 310     pop(saved_reg);
 311   }
 312   if (L_failure != &L_fallthrough) {
 313     b(*L_failure);
 314   }
 315 
 316   bind(L_fallthrough);
 317 #endif
 318 }
 319 
 320 // Returns address of receiver parameter, using tmp as base register. tmp and params_count can be the same.
 321 Address MacroAssembler::receiver_argument_address(Register params_base, Register params_count, Register tmp) {
 322   assert_different_registers(params_base, params_count);
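       // Computes params_base + (params_count - 1) * Interpreter::stackElementSize, i.e. the
       // highest-addressed parameter slot, which holds the receiver.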
 323   add(tmp, params_base, AsmOperand(params_count, lsl, Interpreter::logStackElementSize));
 324   return Address(tmp, -Interpreter::stackElementSize);
 325 }
 326 
 327 
 328 void MacroAssembler::align(int modulus) {
 329   while (offset() % modulus != 0) {
 330     nop();
 331   }
 332 }
 333 
 334 int MacroAssembler::set_last_Java_frame(Register last_java_sp,
 335                                         Register last_java_fp,
 336                                         bool save_last_java_pc,
 337                                         Register tmp) {
 338   int pc_offset;
 339   if (last_java_fp != noreg) {
 340     // optional
 341     str(last_java_fp, Address(Rthread, JavaThread::last_Java_fp_offset()));
 342     _fp_saved = true;
 343   } else {
 344     _fp_saved = false;
 345   }
 346   if (AARCH64_ONLY(true) NOT_AARCH64(save_last_java_pc)) { // optional on 32-bit ARM
 347 #ifdef AARCH64
 348     pc_offset = mov_pc_to(tmp);
 349     str(tmp, Address(Rthread, JavaThread::last_Java_pc_offset()));
 350 #else
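         // When PC is stored with str, the value written is ahead of the str instruction by a
         // CPU-dependent amount; VM_Version::stored_pc_adjustment() compensates so that the
         // returned pc_offset matches the value actually saved (used for oopmaps).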
 351     str(PC, Address(Rthread, JavaThread::last_Java_pc_offset()));
 352     pc_offset = offset() + VM_Version::stored_pc_adjustment();
 353 #endif
 354     _pc_saved = true;
 355   } else {
 356     _pc_saved = false;
 357     pc_offset = -1;
 358   }
 359   // According to the comment in javaFrameAnchor.hpp, SP must be saved last, so that other
 360   // entries are valid when SP is set.
 361 
 362   // However, this is probably not a strong constraint since, for instance, PC is
 363   // sometimes read from the stack at SP... but is pushed later (by the call). Hence,
 364   // we now write the fields in the expected order but we have not added a StoreStore
 365   // barrier.
 366 
 367   // XXX: if the ordering is really important, PC should always be saved (without forgetting
 368   // to update oop_map offsets) and a StoreStore barrier might be needed.
 369 
 370   if (last_java_sp == noreg) {
 371     last_java_sp = SP; // always saved
 372   }
 373 #ifdef AARCH64
 374   if (last_java_sp == SP) {
 375     mov(tmp, SP);
 376     str(tmp, Address(Rthread, JavaThread::last_Java_sp_offset()));
 377   } else {
 378     str(last_java_sp, Address(Rthread, JavaThread::last_Java_sp_offset()));
 379   }
 380 #else
 381   str(last_java_sp, Address(Rthread, JavaThread::last_Java_sp_offset()));
 382 #endif
 383 
 384   return pc_offset; // for oopmaps
 385 }
 386 
 387 void MacroAssembler::reset_last_Java_frame(Register tmp) {
 388   const Register Rzero = zero_register(tmp);
 389   str(Rzero, Address(Rthread, JavaThread::last_Java_sp_offset()));
 390   if (_fp_saved) {
 391     str(Rzero, Address(Rthread, JavaThread::last_Java_fp_offset()));
 392   }
 393   if (_pc_saved) {
 394     str(Rzero, Address(Rthread, JavaThread::last_Java_pc_offset()));
 395   }
 396 }
 397 
 398 
 399 // Implementation of call_VM versions
 400 
 401 void MacroAssembler::call_VM_leaf_helper(address entry_point, int number_of_arguments) {
 402   assert(number_of_arguments >= 0, "cannot have negative number of arguments");
 403   assert(number_of_arguments <= 4, "cannot have more than 4 arguments");
 404 
 405 #ifndef AARCH64
 406   // Safer to save R9 here since callers may have been written
 407   // assuming R9 survives. This is suboptimal but is not worth
 408   // optimizing for the few platforms where R9 is scratched.
 409   push(RegisterSet(R4) | R9ifScratched);
 410   mov(R4, SP);
 411   bic(SP, SP, StackAlignmentInBytes - 1);
 412 #endif // AARCH64
 413   call(entry_point, relocInfo::runtime_call_type);
 414 #ifndef AARCH64
 415   mov(SP, R4);
 416   pop(RegisterSet(R4) | R9ifScratched);
 417 #endif // AARCH64
 418 }
 419 
 420 
 421 void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
 422   assert(number_of_arguments >= 0, "cannot have negative number of arguments");
 423   assert(number_of_arguments <= 3, "cannot have more than 3 arguments");
 424 
 425   const Register tmp = Rtemp;
 426   assert_different_registers(oop_result, tmp);
 427 
 428   set_last_Java_frame(SP, FP, true, tmp);
 429 
 430 #ifdef ASSERT
 431   AARCH64_ONLY(if (UseCompressedOops || UseCompressedClassPointers) { verify_heapbase("call_VM_helper: heap base corrupted?"); });
 432 #endif // ASSERT
 433 
 434 #ifndef AARCH64
 435 #if R9_IS_SCRATCHED
 436   // Safer to save R9 here since callers may have been written
 437   // assuming R9 survives. This is suboptimal but is not worth
 438   // optimizing for the few platforms where R9 is scratched.
 439 
 440   // Note: cannot save R9 above the saved SP (some calls expect, for
 441   // instance, the Java stack top at the saved SP)
 442   // => once saved (with set_last_Java_frame), decrease SP before rounding to
 443   // ensure the slot at SP will be free for R9.
 444   sub(SP, SP, 4);
 445   bic(SP, SP, StackAlignmentInBytes - 1);
 446   str(R9, Address(SP, 0));
 447 #else
 448   bic(SP, SP, StackAlignmentInBytes - 1);
 449 #endif // R9_IS_SCRATCHED
 450 #endif
 451 
 452   mov(R0, Rthread);
 453   call(entry_point, relocInfo::runtime_call_type);
 454 
 455 #ifndef AARCH64
 456 #if R9_IS_SCRATCHED
 457   ldr(R9, Address(SP, 0));
 458 #endif
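       // Restore the raw pre-call SP saved by set_last_Java_frame (this undoes the alignment
       // rounding performed before the call).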
 459   ldr(SP, Address(Rthread, JavaThread::last_Java_sp_offset()));
 460 #endif
 461 
 462   reset_last_Java_frame(tmp);
 463 
 464   // C++ interp handles this in the interpreter
 465   check_and_handle_popframe();
 466   check_and_handle_earlyret();
 467 
 468   if (check_exceptions) {
 469     // check for pending exceptions
 470     ldr(tmp, Address(Rthread, Thread::pending_exception_offset()));
 471 #ifdef AARCH64
 472     Label L;
 473     cbz(tmp, L);
 474     mov_pc_to(Rexception_pc);
 475     b(StubRoutines::forward_exception_entry());
 476     bind(L);
 477 #else
 478     cmp(tmp, 0);
 479     mov(Rexception_pc, PC, ne);
 480     b(StubRoutines::forward_exception_entry(), ne);
 481 #endif // AARCH64
 482   }
 483 
 484   // get oop result if there is one and reset the value in the thread
 485   if (oop_result->is_valid()) {
 486     get_vm_result(oop_result, tmp);
 487   }
 488 }
 489 
 490 void MacroAssembler::call_VM(Register oop_result, address entry_point, bool check_exceptions) {
 491   call_VM_helper(oop_result, entry_point, 0, check_exceptions);
 492 }
 493 
 494 
 495 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, bool check_exceptions) {
 496   assert (arg_1 == R1, "fixed register for arg_1");
 497   call_VM_helper(oop_result, entry_point, 1, check_exceptions);
 498 }
 499 
 500 
 501 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, bool check_exceptions) {
 502   assert (arg_1 == R1, "fixed register for arg_1");
 503   assert (arg_2 == R2, "fixed register for arg_2");
 504   call_VM_helper(oop_result, entry_point, 2, check_exceptions);
 505 }
 506 
 507 
 508 void MacroAssembler::call_VM(Register oop_result, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions) {
 509   assert (arg_1 == R1, "fixed register for arg_1");
 510   assert (arg_2 == R2, "fixed register for arg_2");
 511   assert (arg_3 == R3, "fixed register for arg_3");
 512   call_VM_helper(oop_result, entry_point, 3, check_exceptions);
 513 }
 514 
 515 
 516 void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments, bool check_exceptions) {
 517   // Not used on ARM
 518   Unimplemented();
 519 }
 520 
 521 
 522 void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions) {
 523   // Not used on ARM
 524   Unimplemented();
 525 }
 526 
 527 
 528 void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions) {
 529   // Not used on ARM
 530   Unimplemented();
 531 }
 532 
 533 
 534 void MacroAssembler::call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions) {
 535   // Not used on ARM
 536   Unimplemented();
 537 }
 538 
 539 // Raw call, without saving/restoring registers, exception handling, etc.
 540 // Mainly used from various stubs.
 541 void MacroAssembler::call_VM(address entry_point, bool save_R9_if_scratched) {
 542   const Register tmp = Rtemp; // Rtemp free since scratched by call
 543   set_last_Java_frame(SP, FP, true, tmp);
 544 #if R9_IS_SCRATCHED
 545   if (save_R9_if_scratched) {
 546     // Note: Saving also R10 for alignment.
 547     push(RegisterSet(R9, R10));
 548   }
 549 #endif
 550   mov(R0, Rthread);
 551   call(entry_point, relocInfo::runtime_call_type);
 552 #if R9_IS_SCRATCHED
 553   if (save_R9_if_scratched) {
 554     pop(RegisterSet(R9, R10));
 555   }
 556 #endif
 557   reset_last_Java_frame(tmp);
 558 }
 559 
 560 void MacroAssembler::call_VM_leaf(address entry_point) {
 561   call_VM_leaf_helper(entry_point, 0);
 562 }
 563 
 564 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1) {
 565   assert (arg_1 == R0, "fixed register for arg_1");
 566   call_VM_leaf_helper(entry_point, 1);
 567 }
 568 
 569 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2) {
 570   assert (arg_1 == R0, "fixed register for arg_1");
 571   assert (arg_2 == R1, "fixed register for arg_2");
 572   call_VM_leaf_helper(entry_point, 2);
 573 }
 574 
 575 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3) {
 576   assert (arg_1 == R0, "fixed register for arg_1");
 577   assert (arg_2 == R1, "fixed register for arg_2");
 578   assert (arg_3 == R2, "fixed register for arg_3");
 579   call_VM_leaf_helper(entry_point, 3);
 580 }
 581 
 582 void MacroAssembler::call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4) {
 583   assert (arg_1 == R0, "fixed register for arg_1");
 584   assert (arg_2 == R1, "fixed register for arg_2");
 585   assert (arg_3 == R2, "fixed register for arg_3");
 586   assert (arg_4 == R3, "fixed register for arg_4");
 587   call_VM_leaf_helper(entry_point, 4);
 588 }
 589 
 590 void MacroAssembler::get_vm_result(Register oop_result, Register tmp) {
 591   assert_different_registers(oop_result, tmp);
 592   ldr(oop_result, Address(Rthread, JavaThread::vm_result_offset()));
 593   str(zero_register(tmp), Address(Rthread, JavaThread::vm_result_offset()));
 594   verify_oop(oop_result);
 595 }
 596 
 597 void MacroAssembler::get_vm_result_2(Register metadata_result, Register tmp) {
 598   assert_different_registers(metadata_result, tmp);
 599   ldr(metadata_result, Address(Rthread, JavaThread::vm_result_2_offset()));
 600   str(zero_register(tmp), Address(Rthread, JavaThread::vm_result_2_offset()));
 601 }
 602 
 603 void MacroAssembler::add_rc(Register dst, Register arg1, RegisterOrConstant arg2) {
 604   if (arg2.is_register()) {
 605     add(dst, arg1, arg2.as_register());
 606   } else {
 607     add(dst, arg1, arg2.as_constant());
 608   }
 609 }
 610 
 611 void MacroAssembler::add_slow(Register rd, Register rn, int c) {
 612 #ifdef AARCH64
 613   if (c == 0) {
 614     if (rd != rn) {
 615       mov(rd, rn);
 616     }
 617     return;
 618   }
 619   if (c < 0) {
 620     sub_slow(rd, rn, -c);
 621     return;
 622   }
 623   if (c > right_n_bits(24)) {
 624     guarantee(rd != rn, "no large add_slow with only one register");
 625     mov_slow(rd, c);
 626     add(rd, rn, rd);
 627   } else {
 628     int lo = c & right_n_bits(12);
 629     int hi = (c >> 12) & right_n_bits(12);
 630     if (lo != 0) {
 631       add(rd, rn, lo, lsl0);
 632     }
 633     if (hi != 0) {
 634       add(rd, (lo == 0) ? rn : rd, hi, lsl12);
 635     }
 636   }
 637 #else
 638   // This function is used in the compiler for handling large frame offsets
 639   if ((c < 0) && (((-c) & ~0x3fc) == 0)) {
 640     return sub(rd, rn, (-c));
 641   }
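       // Example (illustrative values): c = 0x1234 splits into low = 0x234, emitted as
       // add(rd, rn, 0x234), and the remainder 0x1000, emitted as add(rd, rd, 0x1000);
       // both pieces are encodable as ARM rotated immediates.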
 642   int low = c & 0x3fc;
 643   if (low != 0) {
 644     add(rd, rn, low);
 645     rn = rd;
 646   }
 647   if (c & ~0x3fc) {
 648     assert(AsmOperand::is_rotated_imm(c & ~0x3fc), "unsupported add_slow offset %d", c);
 649     add(rd, rn, c & ~0x3fc);
 650   } else if (rd != rn) {
 651     assert(c == 0, "");
 652     mov(rd, rn); // need to generate at least one move!
 653   }
 654 #endif // AARCH64
 655 }
 656 
 657 void MacroAssembler::sub_slow(Register rd, Register rn, int c) {
 658 #ifdef AARCH64
 659   if (c <= 0) {
 660     add_slow(rd, rn, -c);
 661     return;
 662   }
 663   if (c > right_n_bits(24)) {
 664     guarantee(rd != rn, "no large sub_slow with only one register");
 665     mov_slow(rd, c);
 666     sub(rd, rn, rd);
 667   } else {
 668     int lo = c & right_n_bits(12);
 669     int hi = (c >> 12) & right_n_bits(12);
 670     if (lo != 0) {
 671       sub(rd, rn, lo, lsl0);
 672     }
 673     if (hi != 0) {
 674       sub(rd, (lo == 0) ? rn : rd, hi, lsl12);
 675     }
 676   }
 677 #else
 678   // This function is used in the compiler for handling large frame offsets
 679   if ((c < 0) && (((-c) & ~0x3fc) == 0)) {
 680     return add(rd, rn, (-c));
 681   }
 682   int low = c & 0x3fc;
 683   if (low != 0) {
 684     sub(rd, rn, low);
 685     rn = rd;
 686   }
 687   if (c & ~0x3fc) {
 688     assert(AsmOperand::is_rotated_imm(c & ~0x3fc), "unsupported sub_slow offset %d", c);
 689     sub(rd, rn, c & ~0x3fc);
 690   } else if (rd != rn) {
 691     assert(c == 0, "");
 692     mov(rd, rn); // need to generate at least one move!
 693   }
 694 #endif // AARCH64
 695 }
 696 
 697 void MacroAssembler::mov_slow(Register rd, address addr) {
 698   // do *not* call the non-relocated mov_related_address
 699   mov_slow(rd, (intptr_t)addr);
 700 }
 701 
 702 void MacroAssembler::mov_slow(Register rd, const char *str) {
 703   mov_slow(rd, (intptr_t)str);
 704 }
 705 
 706 #ifdef AARCH64
 707 
 708 // Common code for mov_slow and instr_count_for_mov_slow.
 709 // Returns the number of instructions in the mov_slow pattern,
 710 // generating the code if a non-null MacroAssembler is given.
 711 int MacroAssembler::mov_slow_helper(Register rd, intptr_t c, MacroAssembler* masm) {
 712   // This code pattern is matched in NativeInstruction::is_mov_slow.
 713   // Update it when modifying this code.
 714 
 715   const intx mask = right_n_bits(16);
 716   // 1 movz instruction
 717   for (int base_shift = 0; base_shift < 64; base_shift += 16) {
 718     if ((c & ~(mask << base_shift)) == 0) {
 719       if (masm != NULL) {
 720         masm->movz(rd, ((uintx)c) >> base_shift, base_shift);
 721       }
 722       return 1;
 723     }
 724   }
 725   // 1 movn instruction
 726   for (int base_shift = 0; base_shift < 64; base_shift += 16) {
 727     if (((~c) & ~(mask << base_shift)) == 0) {
 728       if (masm != NULL) {
 729         masm->movn(rd, ((uintx)(~c)) >> base_shift, base_shift);
 730       }
 731       return 1;
 732     }
 733   }
 734   // 1 orr instruction
 735   {
 736     LogicalImmediate imm(c, false);
 737     if (imm.is_encoded()) {
 738       if (masm != NULL) {
 739         masm->orr(rd, ZR, imm);
 740       }
 741       return 1;
 742     }
 743   }
 744   // 1 movz/movn + up to 3 movk instructions
 745   int zeroes = 0;
 746   int ones = 0;
 747   for (int base_shift = 0; base_shift < 64; base_shift += 16) {
 748     int part = (c >> base_shift) & mask;
 749     if (part == 0) {
 750       ++zeroes;
 751     } else if (part == mask) {
 752       ++ones;
 753     }
 754   }
 755   int def_bits = 0;
 756   if (ones > zeroes) {
 757     def_bits = mask;
 758   }
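       // def_bits is the dominant halfword pattern (all zeroes or all ones). Halfwords equal
       // to def_bits come for free from the initial movz/movn; each remaining halfword costs
       // one movk in the loop below.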
 759   int inst_count = 0;
 760   for (int base_shift = 0; base_shift < 64; base_shift += 16) {
 761     int part = (c >> base_shift) & mask;
 762     if (part != def_bits) {
 763       if (masm != NULL) {
 764         if (inst_count > 0) {
 765           masm->movk(rd, part, base_shift);
 766         } else {
 767           if (def_bits == 0) {
 768             masm->movz(rd, part, base_shift);
 769           } else {
 770             masm->movn(rd, ~part & mask, base_shift);
 771           }
 772         }
 773       }
 774       inst_count++;
 775     }
 776   }
 777   assert((1 <= inst_count) && (inst_count <= 4), "incorrect number of instructions");
 778   return inst_count;
 779 }
 780 
 781 void MacroAssembler::mov_slow(Register rd, intptr_t c) {
 782 #ifdef ASSERT
 783   int off = offset();
 784 #endif
 785   (void) mov_slow_helper(rd, c, this);
 786   assert(offset() - off == instr_count_for_mov_slow(c) * InstructionSize, "size mismatch");
 787 }
 788 
 789 // Counts instructions generated by mov_slow(rd, c).
 790 int MacroAssembler::instr_count_for_mov_slow(intptr_t c) {
 791   return mov_slow_helper(noreg, c, NULL);
 792 }
 793 
 794 int MacroAssembler::instr_count_for_mov_slow(address c) {
 795   return mov_slow_helper(noreg, (intptr_t)c, NULL);
 796 }
 797 
 798 #else
 799 
 800 void MacroAssembler::mov_slow(Register rd, intptr_t c, AsmCondition cond) {
 801   if (AsmOperand::is_rotated_imm(c)) {
 802     mov(rd, c, cond);
 803   } else if (AsmOperand::is_rotated_imm(~c)) {
 804     mvn(rd, ~c, cond);
 805   } else if (VM_Version::supports_movw()) {
 806     movw(rd, c & 0xffff, cond);
 807     if ((unsigned int)c >> 16) {
 808       movt(rd, (unsigned int)c >> 16, cond);
 809     }
 810   } else {
 811     // Find first non-zero bit
 812     int shift = 0;
 813     while ((c & (3 << shift)) == 0) {
 814       shift += 2;
 815     }
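         // shift advances by 2 because ARM rotated immediates rotate by even amounts only,
         // so each 8-bit chunk must start at an even bit position to be encodable.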
 816     // Put the least significant part of the constant
 817     int mask = 0xff << shift;
 818     mov(rd, c & mask, cond);
 819     // Add up to 3 other parts of the constant;
 820     // each of them can be represented as rotated_imm
 821     if (c & (mask << 8)) {
 822       orr(rd, rd, c & (mask << 8), cond);
 823     }
 824     if (c & (mask << 16)) {
 825       orr(rd, rd, c & (mask << 16), cond);
 826     }
 827     if (c & (mask << 24)) {
 828       orr(rd, rd, c & (mask << 24), cond);
 829     }
 830   }
 831 }
 832 
 833 #endif // AARCH64
 834 
 835 void MacroAssembler::mov_oop(Register rd, jobject o, int oop_index,
 836 #ifdef AARCH64
 837                              bool patchable
 838 #else
 839                              AsmCondition cond
 840 #endif
 841                              ) {
 842 
 843   if (o == NULL) {
 844 #ifdef AARCH64
 845     if (patchable) {
 846       nop();
 847     }
 848     mov(rd, ZR);
 849 #else
 850     mov(rd, 0, cond);
 851 #endif
 852     return;
 853   }
 854 
 855   if (oop_index == 0) {
 856     oop_index = oop_recorder()->allocate_oop_index(o);
 857   }
 858   relocate(oop_Relocation::spec(oop_index));
 859 
 860 #ifdef AARCH64
 861   if (patchable) {
 862     nop();
 863   }
 864   ldr(rd, pc());
 865 #else
 866   if (VM_Version::supports_movw()) {
 867     movw(rd, 0, cond);
 868     movt(rd, 0, cond);
 869   } else {
 870     ldr(rd, Address(PC), cond);
 871     // Extra nop to handle case of large offset of oop placeholder (see NativeMovConstReg::set_data).
 872     nop();
 873   }
 874 #endif
 875 }
 876 
 877 void MacroAssembler::mov_metadata(Register rd, Metadata* o, int metadata_index AARCH64_ONLY_ARG(bool patchable)) {
 878   if (o == NULL) {
 879 #ifdef AARCH64
 880     if (patchable) {
 881       nop();
 882     }
 883 #endif
 884     mov(rd, 0);
 885     return;
 886   }
 887 
 888   if (metadata_index == 0) {
 889     metadata_index = oop_recorder()->allocate_metadata_index(o);
 890   }
 891   relocate(metadata_Relocation::spec(metadata_index));
 892 
 893 #ifdef AARCH64
 894   if (patchable) {
 895     nop();
 896   }
 897 #ifdef COMPILER2
 898   if (!patchable && VM_Version::prefer_moves_over_load_literal()) {
 899     mov_slow(rd, (address)o);
 900     return;
 901   }
 902 #endif
 903   ldr(rd, pc());
 904 #else
 905   if (VM_Version::supports_movw()) {
 906     movw(rd, ((int)o) & 0xffff);
 907     movt(rd, (unsigned int)o >> 16);
 908   } else {
 909     ldr(rd, Address(PC));
 910     // Extra nop to handle case of large offset of metadata placeholder (see NativeMovConstReg::set_data).
 911     nop();
 912   }
 913 #endif // AARCH64
 914 }
 915 
 916 void MacroAssembler::mov_float(FloatRegister fd, jfloat c NOT_AARCH64_ARG(AsmCondition cond)) {
 917   Label skip_constant;
 918   union {
 919     jfloat f;
 920     jint i;
 921   } accessor;
 922   accessor.f = c;
 923 
 924 #ifdef AARCH64
 925   // TODO-AARCH64 - try to optimize loading of float constants with fmov and/or mov_slow
 926   Label L;
 927   ldr_s(fd, target(L));
 928   b(skip_constant);
 929   bind(L);
 930   emit_int32(accessor.i);
 931   bind(skip_constant);
 932 #else
 933   flds(fd, Address(PC), cond);
 934   b(skip_constant);
 935   emit_int32(accessor.i);
 936   bind(skip_constant);
 937 #endif // AARCH64
 938 }
 939 
 940 void MacroAssembler::mov_double(FloatRegister fd, jdouble c NOT_AARCH64_ARG(AsmCondition cond)) {
 941   Label skip_constant;
 942   union {
 943     jdouble d;
 944     jint i[2];
 945   } accessor;
 946   accessor.d = c;
 947 
 948 #ifdef AARCH64
 949   // TODO-AARCH64 - try to optimize loading of double constants with fmov
 950   Label L;
 951   ldr_d(fd, target(L));
 952   b(skip_constant);
 953   align(wordSize);
 954   bind(L);
 955   emit_int32(accessor.i[0]);
 956   emit_int32(accessor.i[1]);
 957   bind(skip_constant);
 958 #else
 959   fldd(fd, Address(PC), cond);
 960   b(skip_constant);
 961   emit_int32(accessor.i[0]);
 962   emit_int32(accessor.i[1]);
 963   bind(skip_constant);
 964 #endif // AARCH64
 965 }
 966 
 967 void MacroAssembler::ldr_global_s32(Register reg, address address_of_global) {
 968   intptr_t addr = (intptr_t) address_of_global;
 969 #ifdef AARCH64
 970   assert((addr & 0x3) == 0, "address should be aligned");
 971 
 972   // FIXME: TODO
 973   if (false && page_reachable_from_cache(address_of_global)) {
 974     assert(false,"TODO: relocate");
 975     //relocate();
 976     adrp(reg, address_of_global);
 977     ldrsw(reg, Address(reg, addr & 0xfff));
 978   } else {
 979     mov_slow(reg, addr & ~0x3fff);
 980     ldrsw(reg, Address(reg, addr & 0x3fff));
 981   }
 982 #else
 983   mov_slow(reg, addr & ~0xfff);
 984   ldr(reg, Address(reg, addr & 0xfff));
 985 #endif
 986 }
 987 
 988 void MacroAssembler::ldr_global_ptr(Register reg, address address_of_global) {
 989 #ifdef AARCH64
 990   intptr_t addr = (intptr_t) address_of_global;
 991   assert ((addr & 0x7) == 0, "address should be aligned");
 992   mov_slow(reg, addr & ~0x7fff);
 993   ldr(reg, Address(reg, addr & 0x7fff));
 994 #else
 995   ldr_global_s32(reg, address_of_global);
 996 #endif
 997 }
 998 
 999 void MacroAssembler::ldrb_global(Register reg, address address_of_global) {
1000   intptr_t addr = (intptr_t) address_of_global;
1001   mov_slow(reg, addr & ~0xfff);
1002   ldrb(reg, Address(reg, addr & 0xfff));
1003 }
1004 
1005 void MacroAssembler::zero_extend(Register rd, Register rn, int bits) {
1006 #ifdef AARCH64
1007   switch (bits) {
1008     case  8: uxtb(rd, rn); break;
1009     case 16: uxth(rd, rn); break;
1010     case 32: mov_w(rd, rn); break;
1011     default: ShouldNotReachHere();
1012   }
1013 #else
1014   if (bits <= 8) {
1015     andr(rd, rn, (1 << bits) - 1);
1016   } else if (bits >= 24) {
1017     bic(rd, rn, -1 << bits);
1018   } else {
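         // 8 < bits < 24: shift left then logical-shift right by (32 - bits) to clear the upper bits.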
1019     mov(rd, AsmOperand(rn, lsl, 32 - bits));
1020     mov(rd, AsmOperand(rd, lsr, 32 - bits));
1021   }
1022 #endif
1023 }
1024 
1025 void MacroAssembler::sign_extend(Register rd, Register rn, int bits) {
1026 #ifdef AARCH64
1027   switch (bits) {
1028     case  8: sxtb(rd, rn); break;
1029     case 16: sxth(rd, rn); break;
1030     case 32: sxtw(rd, rn); break;
1031     default: ShouldNotReachHere();
1032   }
1033 #else
1034   mov(rd, AsmOperand(rn, lsl, 32 - bits));
1035   mov(rd, AsmOperand(rd, asr, 32 - bits));
1036 #endif
1037 }
1038 
1039 #ifndef AARCH64
1040 
1041 void MacroAssembler::long_move(Register rd_lo, Register rd_hi,
1042                                Register rn_lo, Register rn_hi,
1043                                AsmCondition cond) {
1044   if (rd_lo != rn_hi) {
1045     if (rd_lo != rn_lo) { mov(rd_lo, rn_lo, cond); }
1046     if (rd_hi != rn_hi) { mov(rd_hi, rn_hi, cond); }
1047   } else if (rd_hi != rn_lo) {
1048     if (rd_hi != rn_hi) { mov(rd_hi, rn_hi, cond); }
1049     if (rd_lo != rn_lo) { mov(rd_lo, rn_lo, cond); }
1050   } else {
1051     eor(rd_lo, rd_hi, rd_lo, cond);
1052     eor(rd_hi, rd_lo, rd_hi, cond);
1053     eor(rd_lo, rd_hi, rd_lo, cond);
1054   }
1055 }
1056 
1057 void MacroAssembler::long_shift(Register rd_lo, Register rd_hi,
1058                                 Register rn_lo, Register rn_hi,
1059                                 AsmShift shift, Register count) {
1060   Register tmp;
1061   if (rd_lo != rn_lo && rd_lo != rn_hi && rd_lo != count) {
1062     tmp = rd_lo;
1063   } else {
1064     tmp = rd_hi;
1065   }
1066   assert_different_registers(tmp, count, rn_lo, rn_hi);
1067 
1068   subs(tmp, count, 32);
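       // tmp = count - 32; pl (count >= 32) selects the case where the result half comes
       // entirely from the other input half, mi (count < 32) the case where both input
       // halves are combined using 32 - count.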
1069   if (shift == lsl) {
1070     assert_different_registers(rd_hi, rn_lo);
1071     assert_different_registers(count, rd_hi);
1072     mov(rd_hi, AsmOperand(rn_lo, shift, tmp), pl);
1073     rsb(tmp, count, 32, mi);
1074     if (rd_hi == rn_hi) {
1075       mov(rd_hi, AsmOperand(rn_hi, lsl, count), mi);
1076       orr(rd_hi, rd_hi, AsmOperand(rn_lo, lsr, tmp), mi);
1077     } else {
1078       mov(rd_hi, AsmOperand(rn_lo, lsr, tmp), mi);
1079       orr(rd_hi, rd_hi, AsmOperand(rn_hi, lsl, count), mi);
1080     }
1081     mov(rd_lo, AsmOperand(rn_lo, shift, count));
1082   } else {
1083     assert_different_registers(rd_lo, rn_hi);
1084     assert_different_registers(rd_lo, count);
1085     mov(rd_lo, AsmOperand(rn_hi, shift, tmp), pl);
1086     rsb(tmp, count, 32, mi);
1087     if (rd_lo == rn_lo) {
1088       mov(rd_lo, AsmOperand(rn_lo, lsr, count), mi);
1089       orr(rd_lo, rd_lo, AsmOperand(rn_hi, lsl, tmp), mi);
1090     } else {
1091       mov(rd_lo, AsmOperand(rn_hi, lsl, tmp), mi);
1092       orr(rd_lo, rd_lo, AsmOperand(rn_lo, lsr, count), mi);
1093     }
1094     mov(rd_hi, AsmOperand(rn_hi, shift, count));
1095   }
1096 }
1097 
1098 void MacroAssembler::long_shift(Register rd_lo, Register rd_hi,
1099                                 Register rn_lo, Register rn_hi,
1100                                 AsmShift shift, int count) {
1101   assert(count != 0 && (count & ~63) == 0, "must be");
1102 
1103   if (shift == lsl) {
1104     assert_different_registers(rd_hi, rn_lo);
1105     if (count >= 32) {
1106       mov(rd_hi, AsmOperand(rn_lo, lsl, count - 32));
1107       mov(rd_lo, 0);
1108     } else {
1109       mov(rd_hi, AsmOperand(rn_hi, lsl, count));
1110       orr(rd_hi, rd_hi, AsmOperand(rn_lo, lsr, 32 - count));
1111       mov(rd_lo, AsmOperand(rn_lo, lsl, count));
1112     }
1113   } else {
1114     assert_different_registers(rd_lo, rn_hi);
1115     if (count >= 32) {
1116       if (count == 32) {
1117         mov(rd_lo, rn_hi);
1118       } else {
1119         mov(rd_lo, AsmOperand(rn_hi, shift, count - 32));
1120       }
1121       if (shift == asr) {
1122         mov(rd_hi, AsmOperand(rn_hi, asr, 0));
1123       } else {
1124         mov(rd_hi, 0);
1125       }
1126     } else {
1127       mov(rd_lo, AsmOperand(rn_lo, lsr, count));
1128       orr(rd_lo, rd_lo, AsmOperand(rn_hi, lsl, 32 - count));
1129       mov(rd_hi, AsmOperand(rn_hi, shift, count));
1130     }
1131   }
1132 }
1133 #endif // !AARCH64
1134 
1135 void MacroAssembler::_verify_oop(Register reg, const char* s, const char* file, int line) {
1136   // This code pattern is matched in NativeInstruction::skip_verify_oop.
1137   // Update it when modifying this code.
1138   if (!VerifyOops) return;
1139 
1140   char buffer[64];
1141 #ifdef COMPILER1
1142   if (CommentedAssembly) {
1143     snprintf(buffer, sizeof(buffer), "verify_oop at %d", offset());
1144     block_comment(buffer);
1145   }
1146 #endif
1147   const char* msg_buffer = NULL;
1148   {
1149     ResourceMark rm;
1150     stringStream ss;
1151     ss.print("%s at offset %d (%s:%d)", s, offset(), file, line);
1152     msg_buffer = code_string(ss.as_string());
1153   }
1154 
1155   save_all_registers();
1156 
1157   if (reg != R2) {
1158       mov(R2, reg);                              // oop to verify
1159   }
1160   mov(R1, SP);                                   // register save area
1161 
1162   Label done;
1163   InlinedString Lmsg(msg_buffer);
1164   ldr_literal(R0, Lmsg);                         // message
1165 
1166   // call indirectly to solve generation ordering problem
1167   ldr_global_ptr(Rtemp, StubRoutines::verify_oop_subroutine_entry_address());
1168   call(Rtemp);
1169 
1170   restore_all_registers();
1171 
1172   b(done);
1173 #ifdef COMPILER2
1174   int off = offset();
1175 #endif
1176   bind_literal(Lmsg);
1177 #ifdef COMPILER2
1178   if (offset() - off == 1 * wordSize) {
1179     // no padding, so insert nop for worst-case sizing
1180     nop();
1181   }
1182 #endif
1183   bind(done);
1184 }
1185 
1186 void MacroAssembler::_verify_oop_addr(Address addr, const char* s, const char* file, int line) {
1187   if (!VerifyOops) return;
1188 
1189   const char* msg_buffer = NULL;
1190   {
1191     ResourceMark rm;
1192     stringStream ss;
1193     if ((addr.base() == SP) && (addr.index()==noreg)) {
1194       ss.print("verify_oop_addr SP[%d]: %s", (int)addr.disp(), s);
1195     } else {
1196       ss.print("verify_oop_addr: %s", s);
1197     }
1198     ss.print(" (%s:%d)", file, line);
1199     msg_buffer = code_string(ss.as_string());
1200   }
1201 
1202   int push_size = save_all_registers();
1203 
1204   if (addr.base() == SP) {
1205     // computes an addr that takes into account the push
1206     if (addr.index() != noreg) {
1207       Register new_base = addr.index() == R2 ? R1 : R2; // avoid corrupting the index
1208       add(new_base, SP, push_size);
1209       addr = addr.rebase(new_base);
1210     } else {
1211       addr = addr.plus_disp(push_size);
1212     }
1213   }
1214 
1215   ldr(R2, addr);                                 // oop to verify
1216   mov(R1, SP);                                   // register save area
1217 
1218   Label done;
1219   InlinedString Lmsg(msg_buffer);
1220   ldr_literal(R0, Lmsg);                         // message
1221 
1222   // call indirectly to solve generation ordering problem
1223   ldr_global_ptr(Rtemp, StubRoutines::verify_oop_subroutine_entry_address());
1224   call(Rtemp);
1225 
1226   restore_all_registers();
1227 
1228   b(done);
1229   bind_literal(Lmsg);
1230   bind(done);
1231 }
1232 
1233 void MacroAssembler::null_check(Register reg, Register tmp, int offset) {
1234   if (needs_explicit_null_check(offset)) {
1235 #ifdef AARCH64
1236     ldr(ZR, Address(reg));
1237 #else
1238     assert_different_registers(reg, tmp);
1239     if (tmp == noreg) {
1240       tmp = Rtemp;
1241       assert((! Thread::current()->is_Compiler_thread()) ||
1242              (! (ciEnv::current()->task() == NULL)) ||
1243              (! (ciEnv::current()->comp_level() == CompLevel_full_optimization)),
1244              "Rtemp not available in C2"); // explicit tmp register required
1245       // XXX: could we mark the code buffer as not compatible with C2 ?
1246     }
1247     ldr(tmp, Address(reg));
1248 #endif
1249   }
1250 }
1251 
1252 // Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`.
1253 void MacroAssembler::eden_allocate(Register obj, Register obj_end, Register tmp1, Register tmp2,
1254                                  RegisterOrConstant size_expression, Label& slow_case) {
1255   if (!Universe::heap()->supports_inline_contig_alloc()) {
1256     b(slow_case);
1257     return;
1258   }
1259 
1260   CollectedHeap* ch = Universe::heap();
1261 
1262   const Register top_addr = tmp1;
1263   const Register heap_end = tmp2;
1264 
1265   if (size_expression.is_register()) {
1266     assert_different_registers(obj, obj_end, top_addr, heap_end, size_expression.as_register());
1267   } else {
1268     assert_different_registers(obj, obj_end, top_addr, heap_end);
1269   }
1270 
1271   bool load_const = AARCH64_ONLY(false) NOT_AARCH64(VM_Version::supports_movw() ); // TODO-AARCH64 check performance
1272   if (load_const) {
1273     mov_address(top_addr, (address)Universe::heap()->top_addr(), symbolic_Relocation::eden_top_reference);
1274   } else {
1275     ldr(top_addr, Address(Rthread, JavaThread::heap_top_addr_offset()));
1276   }
1277   // Calculate new heap_top by adding the size of the object
1278   Label retry;
1279   bind(retry);
1280 
1281 #ifdef AARCH64
1282   ldxr(obj, top_addr);
1283 #else
1284   ldr(obj, Address(top_addr));
1285 #endif // AARCH64
1286 
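       // Load the heap end pointer via its fixed offset from the heap top pointer, so the
       // same base register (top_addr) serves both loads.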
1287   ldr(heap_end, Address(top_addr, (intptr_t)ch->end_addr() - (intptr_t)ch->top_addr()));
1288   add_rc(obj_end, obj, size_expression);
1289   // Check if obj_end wrapped around, i.e., obj_end < obj. If yes, jump to the slow case.
1290   cmp(obj_end, obj);
1291   b(slow_case, lo);
1292   // Update heap_top if allocation succeeded
1293   cmp(obj_end, heap_end);
1294   b(slow_case, hi);
1295 
1296 #ifdef AARCH64
1297   stxr(heap_end/*scratched*/, obj_end, top_addr);
1298   cbnz_w(heap_end, retry);
1299 #else
1300   atomic_cas_bool(obj, obj_end, top_addr, 0, heap_end/*scratched*/);
1301   b(retry, ne);
1302 #endif // AARCH64
1303 }
1304 
1305 // Puts address of allocated object into register `obj` and end of allocated object into register `obj_end`.
1306 void MacroAssembler::tlab_allocate(Register obj, Register obj_end, Register tmp1,
1307                                  RegisterOrConstant size_expression, Label& slow_case) {
1308   const Register tlab_end = tmp1;
1309   assert_different_registers(obj, obj_end, tlab_end);
1310 
1311   ldr(obj, Address(Rthread, JavaThread::tlab_top_offset()));
1312   ldr(tlab_end, Address(Rthread, JavaThread::tlab_end_offset()));
1313   add_rc(obj_end, obj, size_expression);
1314   cmp(obj_end, tlab_end);
1315   b(slow_case, hi);
1316   str(obj_end, Address(Rthread, JavaThread::tlab_top_offset()));
1317 }
1318 
1319 void MacroAssembler::tlab_refill(Register top, Register tmp1, Register tmp2,
1320                                  Register tmp3, Register tmp4,
1321                                Label& try_eden, Label& slow_case) {
1322   if (!Universe::heap()->supports_inline_contig_alloc()) {
1323     b(slow_case);
1324     return;
1325   }
1326 
1327   InlinedAddress intArrayKlass_addr((address)Universe::intArrayKlassObj_addr());
1328   Label discard_tlab, do_refill;
1329   ldr(top,  Address(Rthread, JavaThread::tlab_top_offset()));
1330   ldr(tmp1, Address(Rthread, JavaThread::tlab_end_offset()));
1331   ldr(tmp2, Address(Rthread, JavaThread::tlab_refill_waste_limit_offset()));
1332 
1333   // Calculate amount of free space
1334   sub(tmp1, tmp1, top);
1335   // Retain tlab and allocate in shared space
1336   // if the amount of free space in tlab is too large to discard
1337   cmp(tmp2, AsmOperand(tmp1, lsr, LogHeapWordSize));
1338   b(discard_tlab, ge);
1339 
1340   // Increment waste limit to prevent getting stuck on this slow path
1341   mov_slow(tmp3, ThreadLocalAllocBuffer::refill_waste_limit_increment());
1342   add(tmp2, tmp2, tmp3);
1343   str(tmp2, Address(Rthread, JavaThread::tlab_refill_waste_limit_offset()));
1344   if (TLABStats) {
1345     ldr_u32(tmp2, Address(Rthread, JavaThread::tlab_slow_allocations_offset()));
1346     add_32(tmp2, tmp2, 1);
1347     str_32(tmp2, Address(Rthread, JavaThread::tlab_slow_allocations_offset()));
1348   }
1349   b(try_eden);
1350   bind_literal(intArrayKlass_addr);
1351 
1352   bind(discard_tlab);
1353   if (TLABStats) {
1354     ldr_u32(tmp2, Address(Rthread, JavaThread::tlab_number_of_refills_offset()));
1355     ldr_u32(tmp3, Address(Rthread, JavaThread::tlab_fast_refill_waste_offset()));
1356     add_32(tmp2, tmp2, 1);
1357     add_32(tmp3, tmp3, AsmOperand(tmp1, lsr, LogHeapWordSize));
1358     str_32(tmp2, Address(Rthread, JavaThread::tlab_number_of_refills_offset()));
1359     str_32(tmp3, Address(Rthread, JavaThread::tlab_fast_refill_waste_offset()));
1360   }
1361   // If tlab is currently allocated (top or end != null)
1362   // then fill [top, end + alignment_reserve) with array object
1363   cbz(top, do_refill);
1364 
1365   // Set up the mark word
1366   mov_slow(tmp2, (intptr_t)markOopDesc::prototype()->copy_set_hash(0x2));
1367   str(tmp2, Address(top, oopDesc::mark_offset_in_bytes()));
1368   // Set klass to intArrayKlass and the length to the remaining space
1369   ldr_literal(tmp2, intArrayKlass_addr);
1370   add(tmp1, tmp1, ThreadLocalAllocBuffer::alignment_reserve_in_bytes() -
1371       typeArrayOopDesc::header_size(T_INT) * HeapWordSize);
1372   Register klass = tmp2;
1373   ldr(klass, Address(tmp2));
1374   logical_shift_right(tmp1, tmp1, LogBytesPerInt); // divide by sizeof(jint)
1375   str_32(tmp1, Address(top, arrayOopDesc::length_offset_in_bytes()));
1376   store_klass(klass, top); // blows klass:
1377   klass = noreg;
1378 
1379   ldr(tmp1, Address(Rthread, JavaThread::tlab_start_offset()));
1380   sub(tmp1, top, tmp1); // size of tlab's allocated portion
1381   incr_allocated_bytes(tmp1, tmp2);
1382 
1383   bind(do_refill);
1384   // Refill the tlab with an eden allocation
1385   ldr(tmp1, Address(Rthread, JavaThread::tlab_size_offset()));
1386   logical_shift_left(tmp4, tmp1, LogHeapWordSize);
1387   eden_allocate(top, tmp1, tmp2, tmp3, tmp4, slow_case);
1388   str(top, Address(Rthread, JavaThread::tlab_start_offset()));
1389   str(top, Address(Rthread, JavaThread::tlab_top_offset()));
1390 
1391 #ifdef ASSERT
1392   // Verify that tmp1 contains tlab_end
1393   ldr(tmp2, Address(Rthread, JavaThread::tlab_size_offset()));
1394   add(tmp2, top, AsmOperand(tmp2, lsl, LogHeapWordSize));
1395   cmp(tmp1, tmp2);
1396   breakpoint(ne);
1397 #endif
1398 
1399   sub(tmp1, tmp1, ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
1400   str(tmp1, Address(Rthread, JavaThread::tlab_end_offset()));
1401 
1402   if (ZeroTLAB) {
1403     // clobbers start and tmp
1404     // top must be preserved!
1405     add(tmp1, tmp1, ThreadLocalAllocBuffer::alignment_reserve_in_bytes());
1406     ldr(tmp2, Address(Rthread, JavaThread::tlab_start_offset()));
1407     zero_memory(tmp2, tmp1, tmp3);
1408   }
1409 }
1410 
1411 // Fills the memory region [start, end) with zeroes. Clobbers the `start` and `tmp` registers.
1412 void MacroAssembler::zero_memory(Register start, Register end, Register tmp) {
1413   Label loop;
1414   const Register ptr = start;
1415 
1416 #ifdef AARCH64
1417   // TODO-AARCH64 - compare performance of 2x word zeroing with simple 1x
1418   const Register size = tmp;
1419   Label remaining, done;
1420 
1421   sub(size, end, start);
1422 
1423 #ifdef ASSERT
1424   { Label L;
1425     tst(size, wordSize - 1);
1426     b(L, eq);
1427     stop("size is not a multiple of wordSize");
1428     bind(L);
1429   }
1430 #endif // ASSERT
1431 
1432   subs(size, size, wordSize);
1433   b(remaining, le);
1434 
1435   // Zero by 2 words per iteration.
1436   bind(loop);
1437   subs(size, size, 2*wordSize);
1438   stp(ZR, ZR, Address(ptr, 2*wordSize, post_indexed));
1439   b(loop, gt);
1440 
1441   bind(remaining);
1442   b(done, ne);
1443   str(ZR, Address(ptr));
1444   bind(done);
1445 #else
1446   mov(tmp, 0);
1447   bind(loop);
1448   cmp(ptr, end);
1449   str(tmp, Address(ptr, wordSize, post_indexed), lo);
1450   b(loop, lo);
1451 #endif // AARCH64
1452 }
1453 
1454 void MacroAssembler::incr_allocated_bytes(RegisterOrConstant size_in_bytes, Register tmp) {
1455 #ifdef AARCH64
1456   ldr(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
1457   add_rc(tmp, tmp, size_in_bytes);
1458   str(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
1459 #else
1460   // Bump total bytes allocated by this thread
1461   Label done;
1462 
1463   ldr(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
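       // The adds below sets the carry flag; as long as the 32-bit add does not carry (cc),
       // the conditional str and branch are all that is needed.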
1464   adds(tmp, tmp, size_in_bytes);
1465   str(tmp, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())), cc);
1466   b(done, cc);
1467 
1468   // Increment the high word and store it single-copy atomically (an unlikely scenario on typical embedded systems, as it means >4GB has been allocated).
1469   // To do so, ldrd/strd instructions are used, which require an even-odd pair of registers. Such a requirement could be difficult to satisfy by
1470   // allocating those registers at a higher level, so the routine is ready to allocate a pair itself.
1471   Register low, high;
1472   // Select either R0/R1 or R2/R3
1473 
1474   if (size_in_bytes.is_register() && (size_in_bytes.as_register() == R0 || size_in_bytes.as_register() == R1)) {
1475     low = R2;
1476     high  = R3;
1477   } else {
1478     low = R0;
1479     high  = R1;
1480   }
1481   push(RegisterSet(low, high));
1482 
1483   ldrd(low, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
1484   adds(low, low, size_in_bytes);
1485   adc(high, high, 0);
1486   strd(low, Address(Rthread, in_bytes(JavaThread::allocated_bytes_offset())));
1487 
1488   pop(RegisterSet(low, high));
1489 
1490   bind(done);
1491 #endif // AARCH64
1492 }
1493 
1494 void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) {
1495   // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM
1496   if (UseStackBanging) {
1497     const int page_size = os::vm_page_size();
1498 
1499     sub_slow(tmp, SP, JavaThread::stack_shadow_zone_size());
1500     strb(R0, Address(tmp));
1501 #ifdef AARCH64
1502     for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= page_size) {
1503       sub(tmp, tmp, page_size);
1504       strb(R0, Address(tmp));
1505     }
1506 #else
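         // Step by 0xff0 rather than a full page so the pre-indexed immediate stays encodable
         // while still touching every page at least once.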
1507     for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) {
1508       strb(R0, Address(tmp, -0xff0, pre_indexed));
1509     }
1510 #endif // AARCH64
1511   }
1512 }
1513 
1514 void MacroAssembler::arm_stack_overflow_check(Register Rsize, Register tmp) {
1515   if (UseStackBanging) {
1516     Label loop;
1517 
1518     mov(tmp, SP);
1519     add_slow(Rsize, Rsize, JavaThread::stack_shadow_zone_size() - os::vm_page_size());
1520 #ifdef AARCH64
1521     sub(tmp, tmp, Rsize);
1522     bind(loop);
1523     subs(Rsize, Rsize, os::vm_page_size());
1524     strb(ZR, Address(tmp, Rsize));
1525 #else
1526     bind(loop);
1527     subs(Rsize, Rsize, 0xff0);
1528     strb(R0, Address(tmp, -0xff0, pre_indexed));
1529 #endif // AARCH64
1530     b(loop, hi);
1531   }
1532 }
1533 
1534 void MacroAssembler::stop(const char* msg) {
1535   // This code pattern is matched in NativeInstruction::is_stop.
1536   // Update it when modifying this code.
1537 #ifdef COMPILER1
1538   if (CommentedAssembly) {
1539     block_comment("stop");
1540   }
1541 #endif
1542 
1543   InlinedAddress Ldebug(CAST_FROM_FN_PTR(address, MacroAssembler::debug));
1544   InlinedString Lmsg(msg);
1545 
1546   // save all registers for further inspection
1547   save_all_registers();
1548 
1549   ldr_literal(R0, Lmsg);                     // message
1550   mov(R1, SP);                               // register save area
1551 
1552 #ifdef AARCH64
1553   ldr_literal(Rtemp, Ldebug);
1554   br(Rtemp);
1555 #else
1556   ldr_literal(PC, Ldebug);                   // call MacroAssembler::debug
1557 #endif // AARCH64
1558 
1559 #if defined(COMPILER2) && defined(AARCH64)
1560   int off = offset();
1561 #endif
1562   bind_literal(Lmsg);
1563   bind_literal(Ldebug);
1564 #if defined(COMPILER2) && defined(AARCH64)
1565   if (offset() - off == 2 * wordSize) {
1566     // no padding, so insert nop for worst-case sizing
1567     nop();
1568   }
1569 #endif
1570 }
1571 
1572 void MacroAssembler::warn(const char* msg) {
1573 #ifdef COMPILER1
1574   if (CommentedAssembly) {
1575     block_comment("warn");
1576   }
1577 #endif
1578 
1579   InlinedAddress Lwarn(CAST_FROM_FN_PTR(address, warning));
1580   InlinedString Lmsg(msg);
1581   Label done;
1582 
1583   int push_size = save_caller_save_registers();
1584 
1585 #ifdef AARCH64
1586   // TODO-AARCH64 - get rid of extra debug parameters
1587   mov(R1, LR);
1588   mov(R2, FP);
1589   add(R3, SP, push_size);
1590 #endif
1591 
1592   ldr_literal(R0, Lmsg);                    // message
1593   ldr_literal(LR, Lwarn);                   // call warning
1594 
1595   call(LR);
1596 
1597   restore_caller_save_registers();
1598 
1599   b(done);
1600   bind_literal(Lmsg);
1601   bind_literal(Lwarn);
1602   bind(done);
1603 }
1604 
1605 
1606 int MacroAssembler::save_all_registers() {
1607   // This code pattern is matched in NativeInstruction::is_save_all_registers.
1608   // Update it when modifying this code.
1609 #ifdef AARCH64
1610   const Register tmp = Rtemp;
1611   raw_push(R30, ZR);
1612   for (int i = 28; i >= 0; i -= 2) {
1613       raw_push(as_Register(i), as_Register(i+1));
1614   }
1615   mov_pc_to(tmp);
1616   str(tmp, Address(SP, 31*wordSize));
1617   ldr(tmp, Address(SP, tmp->encoding()*wordSize));
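       // tmp was just clobbered to materialize PC; its original value was saved
       // by the pushes above, so reload it from its own slot in the save area.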
1618   return 32*wordSize;
1619 #else
1620   push(RegisterSet(R0, R12) | RegisterSet(LR) | RegisterSet(PC));
1621   return 15*wordSize;
1622 #endif // AARCH64
1623 }
1624 
1625 void MacroAssembler::restore_all_registers() {
1626 #ifdef AARCH64
1627   for (int i = 0; i <= 28; i += 2) {
1628     raw_pop(as_Register(i), as_Register(i+1));
1629   }
1630   raw_pop(R30, ZR);
1631 #else
1632   pop(RegisterSet(R0, R12) | RegisterSet(LR));   // restore registers
1633   add(SP, SP, wordSize);                         // discard saved PC
1634 #endif // AARCH64
1635 }
1636 
1637 int MacroAssembler::save_caller_save_registers() {
1638 #ifdef AARCH64
1639   for (int i = 0; i <= 16; i += 2) {
1640     raw_push(as_Register(i), as_Register(i+1));
1641   }
1642   raw_push(R18, LR);
1643   return 20*wordSize;
1644 #else
1645 #if R9_IS_SCRATCHED
1646   // Save also R10 to preserve alignment
1647   push(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR) | RegisterSet(R9,R10));
1648   return 8*wordSize;
1649 #else
1650   push(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR));
1651   return 6*wordSize;
1652 #endif
1653 #endif // AARCH64
1654 }
1655 
1656 void MacroAssembler::restore_caller_save_registers() {
1657 #ifdef AARCH64
1658   raw_pop(R18, LR);
1659   for (int i = 16; i >= 0; i -= 2) {
1660     raw_pop(as_Register(i), as_Register(i+1));
1661   }
1662 #else
1663 #if R9_IS_SCRATCHED
1664   pop(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR) | RegisterSet(R9,R10));
1665 #else
1666   pop(RegisterSet(R0, R3) | RegisterSet(R12) | RegisterSet(LR));
1667 #endif
1668 #endif // AARCH64
1669 }
1670 
1671 void MacroAssembler::debug(const char* msg, const intx* registers) {
1672   // In order to get locks to work, we need to fake an in_VM state
1673   JavaThread* thread = JavaThread::current();
1674   thread->set_thread_state(_thread_in_vm);
1675 
1676   if (ShowMessageBoxOnError) {
1677     ttyLocker ttyl;
1678     if (CountBytecodes || TraceBytecodes || StopInterpreterAt) {
1679       BytecodeCounter::print();
1680     }
1681     if (os::message_box(msg, "Execution stopped, print registers?")) {
1682 #ifdef AARCH64
1683       // saved registers: R0-R30, PC
1684       const int nregs = 32;
1685 #else
1686       // saved registers: R0-R12, LR, PC
1687       const int nregs = 15;
1688       const Register regs[nregs] = {R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, PC};
1689 #endif // AARCH64
1690 
1691       for (int i = 0; i < nregs AARCH64_ONLY(-1); i++) {
1692         tty->print_cr("%s = " INTPTR_FORMAT, AARCH64_ONLY(as_Register(i)) NOT_AARCH64(regs[i])->name(), registers[i]);
1693       }
1694 
1695 #ifdef AARCH64
1696       tty->print_cr("pc = " INTPTR_FORMAT, registers[nregs-1]);
1697 #endif // AARCH64
1698 
1699       // derive the original SP value from the address of the register save area
1700       tty->print_cr("%s = " INTPTR_FORMAT, SP->name(), p2i(&registers[nregs]));
1701     }
1702     BREAKPOINT;
1703   } else {
1704     ::tty->print_cr("=============== DEBUG MESSAGE: %s ================\n", msg);
1705   }
1706   assert(false, "DEBUG MESSAGE: %s", msg);
1707   fatal("%s", msg); // returning from MacroAssembler::debug is not supported
1708 }
1709 
1710 void MacroAssembler::unimplemented(const char* what) {
1711   const char* buf = NULL;
1712   {
1713     ResourceMark rm;
1714     stringStream ss;
1715     ss.print("unimplemented: %s", what);
1716     buf = code_string(ss.as_string());
1717   }
1718   stop(buf);
1719 }
1720 
1721 
1722 // Implementation of FixedSizeCodeBlock
1723 
1724 FixedSizeCodeBlock::FixedSizeCodeBlock(MacroAssembler* masm, int size_in_instrs, bool enabled) :
1725 _masm(masm), _start(masm->pc()), _size_in_instrs(size_in_instrs), _enabled(enabled) {
1726 }
1727 
1728 FixedSizeCodeBlock::~FixedSizeCodeBlock() {
1729   if (_enabled) {
1730     address curr_pc = _masm->pc();
1731 
1732     assert(_start < curr_pc, "invalid current pc");
1733     guarantee(curr_pc <= _start + _size_in_instrs * Assembler::InstructionSize, "code block is too long");
1734 
1735     int nops_count = (_start - curr_pc) / Assembler::InstructionSize + _size_in_instrs;
1736     for (int i = 0; i < nops_count; i++) {
1737       _masm->nop();
1738     }
1739   }
1740 }
1741 
1742 #ifdef AARCH64
1743 
1744 // Serializes memory.
1745 // The tmp register is not used on AArch64; the parameter exists solely for compatibility with the 32-bit ARM version.
1746 void MacroAssembler::membar(Membar_mask_bits order_constraint, Register tmp) {
1747   if (!os::is_MP()) return;
1748 
1749   // TODO-AARCH64 investigate dsb vs dmb effects
1750   if (order_constraint == StoreStore) {
1751     dmb(DMB_st);
1752   } else if ((order_constraint & ~(LoadLoad | LoadStore)) == 0) {
1753     dmb(DMB_ld);
1754   } else {
1755     dmb(DMB_all);
1756   }
1757 }
1758 
1759 #else
1760 
1761 // Serializes memory. Potentially blows flags and the tmp register.
1762 // tmp is a scratch register for the ARMv6 coprocessor write op (can be noreg for other architecture versions)
1763 // preserve_flags (optional) takes a longer path in the LoadStore case (dmb rather than a control dependency) to preserve the status flags.
1764 // load_tgt (optional) is the target register of an ordered load, used in the LoadStore case only to create a dependency between the load and the conditional branch.
1765 void MacroAssembler::membar(Membar_mask_bits order_constraint,
1766                             Register tmp,
1767                             bool preserve_flags,
1768                             Register load_tgt) {
1769   if (!os::is_MP()) return;
1770 
1771   if (order_constraint == StoreStore) {
1772     dmb(DMB_st, tmp);
1773   } else if ((order_constraint & StoreLoad)  ||
1774              (order_constraint & LoadLoad)   ||
1775              (order_constraint & StoreStore) ||
1776              (load_tgt == noreg)             ||
1777              preserve_flags) {
1778     dmb(DMB_all, tmp);
1779   } else {
1780     // LoadStore: reordering of speculative stores is prohibited
1781 
1782     // By providing an ordered load target register, we avoid an extra memory load reference
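         // Comparing the loaded value with itself always sets 'eq', so the branch
         // below is never taken; it exists only to create a control dependency from
         // the load to subsequent stores, which orders them because stores are not
         // performed speculatively past an unresolved conditional branch.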
1783     Label not_taken;
1784     bind(not_taken);
1785     cmp(load_tgt, load_tgt);
1786     b(not_taken, ne);
1787   }
1788 }
1789 
1790 #endif // AARCH64
1791 
1792 // If "allow_fallthrough_on_failure" is false, we always branch to "slow_case"
1793 // on failure, so fall-through can only mean success.
1794 // "one_shot" controls whether we loop and retry to mitigate spurious failures.
1795 // This is only needed for C2, which for some reason does not retry,
1796 // while the C1/interpreter path does.
1797 // TODO: measure if it makes a difference
1798 
1799 void MacroAssembler::cas_for_lock_acquire(Register oldval, Register newval,
1800   Register base, Register tmp, Label &slow_case,
1801   bool allow_fallthrough_on_failure, bool one_shot)
1802 {
1803 
1804   bool fallthrough_is_success = false;
1805 
1806   // ARM Litmus Test example does prefetching here.
1807   // TODO: investigate if it helps performance
1808 
1809   // The last store was to the displaced header, so to prevent
1810   // reordering we must issue a StoreStore or Release barrier before
1811   // the CAS store.
1812 
1813 #ifdef AARCH64
1814 
1815   Register Rscratch = tmp;
1816   Register Roop = base;
1817   Register mark = oldval;
1818   Register Rbox = newval;
1819   Label loop;
1820 
1821   assert(oopDesc::mark_offset_in_bytes() == 0, "must be");
1822 
1823   // Instead of StoreStore here, we use store-release-exclusive below
1824 
1825   bind(loop);
1826 
1827   ldaxr(tmp, base);  // acquire
1828   cmp(tmp, oldval);
1829   b(slow_case, ne);
1830   stlxr(tmp, newval, base); // release
1831   if (one_shot) {
1832     cmp_w(tmp, 0);
1833   } else {
1834     cbnz_w(tmp, loop);
1835     fallthrough_is_success = true;
1836   }
1837 
1838   // MemBarAcquireLock would normally go here, but
1839   // we already do ldaxr+stlxr above, which has
1840   // Sequential Consistency
1841 
1842 #else
1843   membar(MacroAssembler::StoreStore, noreg);
1844 
1845   if (one_shot) {
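         // Conditionally executed CAS: if the loaded value does not match oldval,
         // the conditional strex and the second cmp are skipped and 'ne' remains
         // set; otherwise strex writes 0 (success) or 1 (failure) into tmp and the
         // second cmp turns that into eq/ne.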
1846     ldrex(tmp, Address(base, oopDesc::mark_offset_in_bytes()));
1847     cmp(tmp, oldval);
1848     strex(tmp, newval, Address(base, oopDesc::mark_offset_in_bytes()), eq);
1849     cmp(tmp, 0, eq);
1850   } else {
1851     atomic_cas_bool(oldval, newval, base, oopDesc::mark_offset_in_bytes(), tmp);
1852   }
1853 
1854   // MemBarAcquireLock barrier
1855   // According to JSR-133 Cookbook, this should be LoadLoad | LoadStore,
1856   // but that doesn't prevent a load or store from floating up between
1857   // the load and store in the CAS sequence, so play it safe and
1858   // do a full fence.
1859   membar(Membar_mask_bits(LoadLoad | LoadStore | StoreStore | StoreLoad), noreg);
1860 #endif
1861   if (!fallthrough_is_success && !allow_fallthrough_on_failure) {
1862     b(slow_case, ne);
1863   }
1864 }
1865 
1866 void MacroAssembler::cas_for_lock_release(Register oldval, Register newval,
1867   Register base, Register tmp, Label &slow_case,
1868   bool allow_fallthrough_on_failure, bool one_shot)
1869 {
1870 
1871   bool fallthrough_is_success = false;
1872 
1873   assert_different_registers(oldval,newval,base,tmp);
1874 
1875 #ifdef AARCH64
1876   Label loop;
1877 
1878   assert(oopDesc::mark_offset_in_bytes() == 0, "must be");
1879 
1880   bind(loop);
1881   ldxr(tmp, base);
1882   cmp(tmp, oldval);
1883   b(slow_case, ne);
1884   // MemBarReleaseLock barrier
1885   stlxr(tmp, newval, base);
1886   if (one_shot) {
1887     cmp_w(tmp, 0);
1888   } else {
1889     cbnz_w(tmp, loop);
1890     fallthrough_is_success = true;
1891   }
1892 #else
1893   // MemBarReleaseLock barrier
1894   // According to JSR-133 Cookbook, this should be StoreStore | LoadStore,
1895   // but that doesn't prevent a load or store from floating down between
1896   // the load and store in the CAS sequence, so play it safe and
1897   // do a full fence.
1898   membar(Membar_mask_bits(LoadLoad | LoadStore | StoreStore | StoreLoad), tmp);
1899 
1900   if (one_shot) {
1901     ldrex(tmp, Address(base, oopDesc::mark_offset_in_bytes()));
1902     cmp(tmp, oldval);
1903     strex(tmp, newval, Address(base, oopDesc::mark_offset_in_bytes()), eq);
1904     cmp(tmp, 0, eq);
1905   } else {
1906     atomic_cas_bool(oldval, newval, base, oopDesc::mark_offset_in_bytes(), tmp);
1907   }
1908 #endif
1909   if (!fallthrough_is_success && !allow_fallthrough_on_failure) {
1910     b(slow_case, ne);
1911   }
1912 
1913   // ExitEnter
1914   // According to JSR-133 Cookbook, this should be StoreLoad, the same
1915   // barrier that follows volatile store.
1916   // TODO: Should be able to remove on armv8 if volatile loads
1917   // use the load-acquire instruction.
1918   membar(StoreLoad, noreg);
1919 }
1920 
1921 #ifndef PRODUCT
1922 
1923 // Preserves flags and all registers.
1924 // On SMP the updated value might not be visible to external observers without a synchronization barrier.
1925 void MacroAssembler::cond_atomic_inc32(AsmCondition cond, int* counter_addr) {
1926   if (counter_addr != NULL) {
1927     InlinedAddress counter_addr_literal((address)counter_addr);
1928     Label done, retry;
1929     if (cond != al) {
1930       b(done, inverse(cond));
1931     }
1932 
1933 #ifdef AARCH64
1934     raw_push(R0, R1);
1935     raw_push(R2, ZR);
1936 
1937     ldr_literal(R0, counter_addr_literal);
1938 
1939     bind(retry);
1940     ldxr_w(R1, R0);
1941     add_w(R1, R1, 1);
1942     stxr_w(R2, R1, R0);
1943     cbnz_w(R2, retry);
1944 
1945     raw_pop(R2, ZR);
1946     raw_pop(R0, R1);
1947 #else
1948     push(RegisterSet(R0, R3) | RegisterSet(Rtemp));
1949     ldr_literal(R0, counter_addr_literal);
1950 
1951     mrs(CPSR, Rtemp);
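         // The flags were saved into Rtemp above because the CAS retry loop below
         // clobbers them; they are restored with msr afterwards, keeping the
         // "preserves flags" promise of this routine.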
1952 
1953     bind(retry);
1954     ldr_s32(R1, Address(R0));
1955     add(R2, R1, 1);
1956     atomic_cas_bool(R1, R2, R0, 0, R3);
1957     b(retry, ne);
1958 
1959     msr(CPSR_fsxc, Rtemp);
1960 
1961     pop(RegisterSet(R0, R3) | RegisterSet(Rtemp));
1962 #endif // AARCH64
1963 
1964     b(done);
1965     bind_literal(counter_addr_literal);
1966 
1967     bind(done);
1968   }
1969 }
1970 
1971 #endif // !PRODUCT
1972 
1973 
1974 // Building block for the CAS cases of biased locking: performs the CAS and records statistics.
1975 // The slow_case label is used to transfer control if the CAS fails; otherwise the condition codes are left set.
1976 void MacroAssembler::biased_locking_enter_with_cas(Register obj_reg, Register old_mark_reg, Register new_mark_reg,
1977                                                  Register tmp, Label& slow_case, int* counter_addr) {
1978 
1979   cas_for_lock_acquire(old_mark_reg, new_mark_reg, obj_reg, tmp, slow_case);
1980 #ifdef ASSERT
1981   breakpoint(ne); // Fallthrough only on success
1982 #endif
1983 #ifndef PRODUCT
1984   if (counter_addr != NULL) {
1985     cond_atomic_inc32(al, counter_addr);
1986   }
1987 #endif // !PRODUCT
1988 }
1989 
1990 int MacroAssembler::biased_locking_enter(Register obj_reg, Register swap_reg, Register tmp_reg,
1991                                          bool swap_reg_contains_mark,
1992                                          Register tmp2,
1993                                          Label& done, Label& slow_case,
1994                                          BiasedLockingCounters* counters) {
1995   // obj_reg must be preserved (at least) if biased locking fails
1996   // tmp_reg is a temporary register
1997   // swap_reg was used as a temporary but contained a value
1998   //   that was used afterwards in some call paths. Callers
1999   //   have been fixed so that swap_reg no longer needs to be
2000   //   saved.
2001   // Rtemp is no longer scratched
2002 
2003   assert(UseBiasedLocking, "why call this otherwise?");
2004   assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp2);
2005   guarantee(swap_reg!=tmp_reg, "invariant");
2006   assert(tmp_reg != noreg, "must supply tmp_reg");
2007 
2008 #ifndef PRODUCT
2009   if (PrintBiasedLockingStatistics && (counters == NULL)) {
2010     counters = BiasedLocking::counters();
2011   }
2012 #endif
2013 
2014   assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout");
2015   Address mark_addr(obj_reg, oopDesc::mark_offset_in_bytes());
2016 
2017   // Biased locking
2018   // See whether the lock is currently biased toward our thread and
2019   // whether the epoch is still valid
2020   // Note that the runtime guarantees sufficient alignment of JavaThread
2021   // pointers to allow age to be placed into low bits
2022   // First check to see whether biasing is even enabled for this object
2023   Label cas_label;
2024 
2025   // The null check applies to the load of the mark word, if we need to load it.
2026   // If the mark has already been loaded into swap_reg then the null check has
2027   // already been performed and the offset is irrelevant.
2028   int null_check_offset = offset();
2029   if (!swap_reg_contains_mark) {
2030     ldr(swap_reg, mark_addr);
2031   }
2032 
2033   // On an MP platform, loads could return 'stale' values in some cases.
2034   // That is acceptable since either the CAS or the slow case path is taken in the worst case.
2035 
2036   andr(tmp_reg, swap_reg, (uintx)markOopDesc::biased_lock_mask_in_place);
2037   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
2038 
2039   b(cas_label, ne);
2040 
2041   // The bias pattern is present in the object's header. Need to check
2042   // whether the bias owner and the epoch are both still current.
2043   load_klass(tmp_reg, obj_reg);
2044   ldr(tmp_reg, Address(tmp_reg, Klass::prototype_header_offset()));
2045   orr(tmp_reg, tmp_reg, Rthread);
2046   eor(tmp_reg, tmp_reg, swap_reg);
2047 
2048 #ifdef AARCH64
2049   ands(tmp_reg, tmp_reg, ~((uintx) markOopDesc::age_mask_in_place));
2050 #else
2051   bics(tmp_reg, tmp_reg, ((int) markOopDesc::age_mask_in_place));
2052 #endif // AARCH64
2053 
2054 #ifndef PRODUCT
2055   if (counters != NULL) {
2056     cond_atomic_inc32(eq, counters->biased_lock_entry_count_addr());
2057   }
2058 #endif // !PRODUCT
2059 
2060   b(done, eq);
2061 
2062   Label try_revoke_bias;
2063   Label try_rebias;
2064 
2065   // At this point we know that the header has the bias pattern and
2066   // that we are not the bias owner in the current epoch. We need to
2067   // figure out more details about the state of the header in order to
2068   // know what operations can be legally performed on the object's
2069   // header.
2070 
2071   // If the low three bits in the xor result aren't clear, that means
2072   // the prototype header is no longer biased and we have to revoke
2073   // the bias on this object.
2074   tst(tmp_reg, (uintx)markOopDesc::biased_lock_mask_in_place);
2075   b(try_revoke_bias, ne);
2076 
2077   // Biasing is still enabled for this data type. See whether the
2078   // epoch of the current bias is still valid, meaning that the epoch
2079   // bits of the mark word are equal to the epoch bits of the
2080   // prototype header. (Note that the prototype header's epoch bits
2081   // only change at a safepoint.) If not, attempt to rebias the object
2082   // toward the current thread. Note that we must be absolutely sure
2083   // that the current epoch is invalid in order to do this because
2084   // otherwise the manipulations it performs on the mark word are
2085   // illegal.
2086   tst(tmp_reg, (uintx)markOopDesc::epoch_mask_in_place);
2087   b(try_rebias, ne);
2088 
2089   // tmp_reg has the age, epoch and pattern bits cleared
2090   // The remaining (owner) bits are (Thread ^ current_owner)
2091 
2092   // The epoch of the current bias is still valid but we know nothing
2093   // about the owner; it might be set or it might be clear. Try to
2094   // acquire the bias of the object using an atomic operation. If this
2095   // fails we will go in to the runtime to revoke the object's bias.
2096   // Note that we first construct the presumed unbiased header so we
2097   // don't accidentally blow away another thread's valid bias.
2098 
2099   // Note that we know the owner is not ourself. Hence, success can
2100   // only happen when the owner bits are 0
2101 
2102 #ifdef AARCH64
2103   // The bit mask biased_lock + age + epoch is not a valid AArch64 logical immediate, as it has
2104   // a cleared bit in the middle (the cms bit), so it is loaded with a separate instruction.
2105   mov(tmp2, (markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place));
2106   andr(swap_reg, swap_reg, tmp2);
2107 #else
2108   // until the assembler can be made smarter, we need to make some assumptions about the values
2109   // so we can optimize this:
2110   assert((markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place) == 0x1ff, "biased bitmasks changed");
2111 
2112   mov(swap_reg, AsmOperand(swap_reg, lsl, 23));
2113   mov(swap_reg, AsmOperand(swap_reg, lsr, 23)); // markOop with thread bits cleared (for CAS)
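       // The shift pair keeps only the low 9 bits of the mark word
       // (biased_lock | age | epoch == 0x1ff); 0x1ff is not encodable as an ARM
       // modified immediate, so two shifts are used instead of a masking AND.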
2114 #endif // AARCH64
2115 
2116   orr(tmp_reg, swap_reg, Rthread); // new mark
2117 
2118   biased_locking_enter_with_cas(obj_reg, swap_reg, tmp_reg, tmp2, slow_case,
2119         (counters != NULL) ? counters->anonymously_biased_lock_entry_count_addr() : NULL);
2120 
2121   // If the biasing toward our thread failed, this means that
2122   // another thread succeeded in biasing it toward itself and we
2123   // need to revoke that bias. The revocation will occur in the
2124   // interpreter runtime in the slow case.
2125 
2126   b(done);
2127 
2128   bind(try_rebias);
2129 
2130   // At this point we know the epoch has expired, meaning that the
2131   // current "bias owner", if any, is actually invalid. Under these
2132   // circumstances _only_, we are allowed to use the current header's
2133   // value as the comparison value when doing the cas to acquire the
2134   // bias in the current epoch. In other words, we allow transfer of
2135   // the bias from one thread to another directly in this situation.
2136 
2137   // tmp_reg low (not owner) bits are (age: 0 | pattern&epoch: prototype^swap_reg)
2138 
2139   eor(tmp_reg, tmp_reg, swap_reg); // OK except for owner bits (age preserved !)
2140 
2141   // owner bits 'random'. Set them to Rthread.
2142 #ifdef AARCH64
2143   mov(tmp2, (markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place));
2144   andr(tmp_reg, tmp_reg, tmp2);
2145 #else
2146   mov(tmp_reg, AsmOperand(tmp_reg, lsl, 23));
2147   mov(tmp_reg, AsmOperand(tmp_reg, lsr, 23));
2148 #endif // AARCH64
2149 
2150   orr(tmp_reg, tmp_reg, Rthread); // new mark
2151 
2152   biased_locking_enter_with_cas(obj_reg, swap_reg, tmp_reg, tmp2, slow_case,
2153         (counters != NULL) ? counters->rebiased_lock_entry_count_addr() : NULL);
2154 
2155   // If the biasing toward our thread failed, then another thread
2156   // succeeded in biasing it toward itself and we need to revoke that
2157   // bias. The revocation will occur in the runtime in the slow case.
2158 
2159   b(done);
2160 
2161   bind(try_revoke_bias);
2162 
2163   // The prototype mark in the klass doesn't have the bias bit set any
2164   // more, indicating that objects of this data type are not supposed
2165   // to be biased any more. We are going to try to reset the mark of
2166   // this object to the prototype value and fall through to the
2167   // CAS-based locking scheme. Note that if our CAS fails, it means
2168   // that another thread raced us for the privilege of revoking the
2169   // bias of this particular object, so it's okay to continue in the
2170   // normal locking code.
2171 
2172   // tmp_reg low (not owner) bits are (age: 0 | pattern&epoch: prototype^swap_reg)
2173 
2174   eor(tmp_reg, tmp_reg, swap_reg); // OK except for owner bits (age preserved !)
2175 
2176   // owner bits 'random'. Clear them
2177 #ifdef AARCH64
2178   mov(tmp2, (markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place));
2179   andr(tmp_reg, tmp_reg, tmp2);
2180 #else
2181   mov(tmp_reg, AsmOperand(tmp_reg, lsl, 23));
2182   mov(tmp_reg, AsmOperand(tmp_reg, lsr, 23));
2183 #endif // AARCH64
2184 
2185   biased_locking_enter_with_cas(obj_reg, swap_reg, tmp_reg, tmp2, cas_label,
2186         (counters != NULL) ? counters->revoked_lock_entry_count_addr() : NULL);
2187 
2188   // Fall through to the normal CAS-based lock, because no matter what
2189   // the result of the above CAS, some thread must have succeeded in
2190   // removing the bias bit from the object's header.
2191 
2192   bind(cas_label);
2193 
2194   return null_check_offset;
2195 }
2196 
2197 
2198 void MacroAssembler::biased_locking_exit(Register obj_reg, Register tmp_reg, Label& done) {
2199   assert(UseBiasedLocking, "why call this otherwise?");
2200 
2201   // Check for biased locking unlock case, which is a no-op
2202   // Note: we do not have to check the thread ID for two reasons.
2203   // First, the interpreter checks for IllegalMonitorStateException at
2204   // a higher level. Second, if the bias was revoked while we held the
2205   // lock, the object could not be rebiased toward another thread, so
2206   // the bias bit would be clear.
2207   ldr(tmp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes()));
2208 
2209   andr(tmp_reg, tmp_reg, (uintx)markOopDesc::biased_lock_mask_in_place);
2210   cmp(tmp_reg, markOopDesc::biased_lock_pattern);
2211   b(done, eq);
2212 }
2213 
2214 
2215 void MacroAssembler::resolve_jobject(Register value,
2216                                      Register tmp1,
2217                                      Register tmp2) {
2218   assert_different_registers(value, tmp1, tmp2);
2219   Label done, not_weak;
2220   cbz(value, done);             // Use NULL as-is.
2221   STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u);
2222   tbz(value, 0, not_weak);      // Test for jweak tag.
2223   // Resolve jweak.
2224   ldr(value, Address(value, -JNIHandles::weak_tag_value));
2225   verify_oop(value);
2226 #if INCLUDE_ALL_GCS
2227   if (UseG1GC) {
2228     g1_write_barrier_pre(noreg, // store_addr
2229                          noreg, // new_val
2230                          value, // pre_val
2231                          tmp1,  // tmp1
2232                          tmp2); // tmp2
2233     }
2234 #endif // INCLUDE_ALL_GCS
2235   b(done);
2236   bind(not_weak);
2237   // Resolve (untagged) jobject.
2238   ldr(value, Address(value));
2239   verify_oop(value);
2240   bind(done);
2241 }
2242 
2243 
2244 //////////////////////////////////////////////////////////////////////////////////
2245 
2246 #if INCLUDE_ALL_GCS
2247 
2248 // G1 pre-barrier.
2249 // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
2250 // If store_addr != noreg, then previous value is loaded from [store_addr];
2251 // in such case store_addr and new_val registers are preserved;
2252 // otherwise pre_val register is preserved.
2253 void MacroAssembler::g1_write_barrier_pre(Register store_addr,
2254                                           Register new_val,
2255                                           Register pre_val,
2256                                           Register tmp1,
2257                                           Register tmp2) {
2258   Label done;
2259   Label runtime;
2260 
2261   if (store_addr != noreg) {
2262     assert_different_registers(store_addr, new_val, pre_val, tmp1, tmp2, noreg);
2263   } else {
2264     assert (new_val == noreg, "should be");
2265     assert_different_registers(pre_val, tmp1, tmp2, noreg);
2266   }
2267 
2268   Address in_progress(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
2269                                         SATBMarkQueue::byte_offset_of_active()));
2270   Address index(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
2271                                   SATBMarkQueue::byte_offset_of_index()));
2272   Address buffer(Rthread, in_bytes(JavaThread::satb_mark_queue_offset() +
2273                                    SATBMarkQueue::byte_offset_of_buf()));
2274 
2275   // Is marking active?
2276   assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "adjust this code");
2277   ldrb(tmp1, in_progress);
2278   cbz(tmp1, done);
2279 
2280   // Do we need to load the previous value?
2281   if (store_addr != noreg) {
2282     load_heap_oop(pre_val, Address(store_addr, 0));
2283   }
2284 
2285   // Is the previous value null?
2286   cbz(pre_val, done);
2287 
2288   // Can we store original value in the thread's buffer?
2289   // Is index == 0?
2290   // (The index field is typed as size_t.)
2291 
2292   ldr(tmp1, index);           // tmp1 := *index_adr
2293   ldr(tmp2, buffer);
2294 
2295   subs(tmp1, tmp1, wordSize); // tmp1 := tmp1 - wordSize
2296   b(runtime, lt);             // If negative, goto runtime
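       // The queue index counts down in bytes toward 0; going negative means the
       // buffer was already full and must be handed to the runtime.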
2297 
2298   str(tmp1, index);           // *index_adr := tmp1
2299 
2300   // Record the previous value
2301   str(pre_val, Address(tmp2, tmp1));
2302   b(done);
2303 
2304   bind(runtime);
2305 
2306   // save the live input values
2307 #ifdef AARCH64
2308   if (store_addr != noreg) {
2309     raw_push(store_addr, new_val);
2310   } else {
2311     raw_push(pre_val, ZR);
2312   }
2313 #else
2314   if (store_addr != noreg) {
2315     // avoid raw_push to support any ordering of store_addr and new_val
2316     push(RegisterSet(store_addr) | RegisterSet(new_val));
2317   } else {
2318     push(pre_val);
2319   }
2320 #endif // AARCH64
2321 
2322   if (pre_val != R0) {
2323     mov(R0, pre_val);
2324   }
2325   mov(R1, Rthread);
2326 
2327   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_pre), R0, R1);
2328 
2329 #ifdef AARCH64
2330   if (store_addr != noreg) {
2331     raw_pop(store_addr, new_val);
2332   } else {
2333     raw_pop(pre_val, ZR);
2334   }
2335 #else
2336   if (store_addr != noreg) {
2337     pop(RegisterSet(store_addr) | RegisterSet(new_val));
2338   } else {
2339     pop(pre_val);
2340   }
2341 #endif // AARCH64
2342 
2343   bind(done);
2344 }
2345 
2346 // G1 post-barrier.
2347 // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
2348 void MacroAssembler::g1_write_barrier_post(Register store_addr,
2349                                            Register new_val,
2350                                            Register tmp1,
2351                                            Register tmp2,
2352                                            Register tmp3) {
2353 
2354   Address queue_index(Rthread, in_bytes(JavaThread::dirty_card_queue_offset() +
2355                                         DirtyCardQueue::byte_offset_of_index()));
2356   Address buffer(Rthread, in_bytes(JavaThread::dirty_card_queue_offset() +
2357                                    DirtyCardQueue::byte_offset_of_buf()));
2358 
2359   BarrierSet* bs = Universe::heap()->barrier_set();
2360   CardTableModRefBS* ct = (CardTableModRefBS*)bs;
2361   Label done;
2362   Label runtime;
2363 
2364   // Does store cross heap regions?
2365 
2366   eor(tmp1, store_addr, new_val);
2367 #ifdef AARCH64
2368   logical_shift_right(tmp1, tmp1, HeapRegion::LogOfHRGrainBytes);
2369   cbz(tmp1, done);
2370 #else
2371   movs(tmp1, AsmOperand(tmp1, lsr, HeapRegion::LogOfHRGrainBytes));
2372   b(done, eq);
2373 #endif
2374 
2375   // crosses regions, storing NULL?
2376 
2377   cbz(new_val, done);
2378 
2379   // storing region crossing non-NULL, is card already dirty?
2380   const Register card_addr = tmp1;
2381   assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
2382 
2383   mov_address(tmp2, (address)ct->byte_map_base, symbolic_Relocation::card_table_reference);
2384   add(card_addr, tmp2, AsmOperand(store_addr, lsr, CardTableModRefBS::card_shift));
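       // card_addr = byte_map_base + (store_addr >> card_shift), i.e. the card
       // table entry covering the updated reference.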
2385 
2386   ldrb(tmp2, Address(card_addr));
2387   cmp(tmp2, (int)G1SATBCardTableModRefBS::g1_young_card_val());
2388   b(done, eq);
2389 
2390   membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad), tmp2);
2391 
2392   assert(CardTableModRefBS::dirty_card_val() == 0, "adjust this code");
2393   ldrb(tmp2, Address(card_addr));
2394   cbz(tmp2, done);
2395 
2396   // storing a region crossing, non-NULL oop, card is clean.
2397   // dirty card and log.
2398 
2399   strb(zero_register(tmp2), Address(card_addr));
2400 
2401   ldr(tmp2, queue_index);
2402   ldr(tmp3, buffer);
2403 
2404   subs(tmp2, tmp2, wordSize);
2405   b(runtime, lt); // go to runtime if now negative
2406 
2407   str(tmp2, queue_index);
2408 
2409   str(card_addr, Address(tmp3, tmp2));
2410   b(done);
2411 
2412   bind(runtime);
2413 
2414   if (card_addr != R0) {
2415     mov(R0, card_addr);
2416   }
2417   mov(R1, Rthread);
2418   call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::g1_wb_post), R0, R1);
2419 
2420   bind(done);
2421 }
2422 
2423 #endif // INCLUDE_ALL_GCS
2424 
2425 //////////////////////////////////////////////////////////////////////////////////
2426 
2427 #ifdef AARCH64
2428 
2429 void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed) {
2430   switch (size_in_bytes) {
2431     case  8: ldr(dst, src); break;
2432     case  4: is_signed ? ldr_s32(dst, src) : ldr_u32(dst, src); break;
2433     case  2: is_signed ? ldrsh(dst, src) : ldrh(dst, src); break;
2434     case  1: is_signed ? ldrsb(dst, src) : ldrb(dst, src); break;
2435     default: ShouldNotReachHere();
2436   }
2437 }
2438 
2439 void MacroAssembler::store_sized_value(Register src, Address dst, size_t size_in_bytes) {
2440   switch (size_in_bytes) {
2441     case  8: str(src, dst);    break;
2442     case  4: str_32(src, dst); break;
2443     case  2: strh(src, dst);   break;
2444     case  1: strb(src, dst);   break;
2445     default: ShouldNotReachHere();
2446   }
2447 }
2448 
2449 #else
2450 
2451 void MacroAssembler::load_sized_value(Register dst, Address src,
2452                                     size_t size_in_bytes, bool is_signed, AsmCondition cond) {
2453   switch (size_in_bytes) {
2454     case  4: ldr(dst, src, cond); break;
2455     case  2: is_signed ? ldrsh(dst, src, cond) : ldrh(dst, src, cond); break;
2456     case  1: is_signed ? ldrsb(dst, src, cond) : ldrb(dst, src, cond); break;
2457     default: ShouldNotReachHere();
2458   }
2459 }
2460 
2461 
2462 void MacroAssembler::store_sized_value(Register src, Address dst, size_t size_in_bytes, AsmCondition cond) {
2463   switch (size_in_bytes) {
2464     case  4: str(src, dst, cond); break;
2465     case  2: strh(src, dst, cond);   break;
2466     case  1: strb(src, dst, cond);   break;
2467     default: ShouldNotReachHere();
2468   }
2469 }
2470 #endif // AARCH64
2471 
2472 // Look up the method for a megamorphic invokeinterface call.
2473 // The target method is determined by <Rinterf, Rindex>.
2474 // The receiver klass is in Rklass.
2475 // On success, the result will be in method_result, and execution falls through.
2476 // On failure, execution transfers to the given label.
2477 void MacroAssembler::lookup_interface_method(Register Rklass,
2478                                              Register Rinterf,
2479                                              Register Rindex,
2480                                              Register method_result,
2481                                              Register temp_reg1,
2482                                              Register temp_reg2,
2483                                              Label& L_no_such_interface) {
2484 
2485   assert_different_registers(Rklass, Rinterf, temp_reg1, temp_reg2, Rindex);
2486 
2487   Register Ritable = temp_reg1;
2488 
2489   // Compute start of first itableOffsetEntry (which is at the end of the vtable)
2490   const int base = in_bytes(Klass::vtable_start_offset());
2491   const int scale = exact_log2(vtableEntry::size_in_bytes());
2492   ldr_s32(temp_reg2, Address(Rklass, Klass::vtable_length_offset())); // Get length of vtable
2493   add(Ritable, Rklass, base);
2494   add(Ritable, Ritable, AsmOperand(temp_reg2, lsl, scale));
2495 
2496   Label entry, search;
2497 
2498   b(entry);
2499 
2500   bind(search);
2501   add(Ritable, Ritable, itableOffsetEntry::size() * HeapWordSize);
2502 
2503   bind(entry);
2504 
2505   // Check that the entry is non-null.  A null entry means that the receiver
2506   // class doesn't implement the interface, and wasn't the same as the
2507   // receiver class checked when the interface was resolved.
2508 
2509   ldr(temp_reg2, Address(Ritable, itableOffsetEntry::interface_offset_in_bytes()));
2510   cbz(temp_reg2, L_no_such_interface);
2511 
2512   cmp(Rinterf, temp_reg2);
2513   b(search, ne);
2514 
2515   ldr_s32(temp_reg2, Address(Ritable, itableOffsetEntry::offset_offset_in_bytes()));
2516   add(temp_reg2, temp_reg2, Rklass); // Add offset to Klass*
2517   assert(itableMethodEntry::size() * HeapWordSize == wordSize, "adjust the scaling in the code below");
2518   assert(itableMethodEntry::method_offset_in_bytes() == 0, "adjust the offset in the code below");
2519 
2520   ldr(method_result, Address::indexed_ptr(temp_reg2, Rindex));
2521 }
2522 
2523 #ifdef COMPILER2
2524 // TODO: 8 bytes at a time? pre-fetch?
2525 // Compare char[] arrays aligned to 4 bytes.
2526 void MacroAssembler::char_arrays_equals(Register ary1, Register ary2,
2527                                         Register limit, Register result,
2528                                       Register chr1, Register chr2, Label& Ldone) {
2529   Label Lvector, Lloop;
2530 
2531   // Note: limit contains number of bytes (2*char_elements) != 0.
2532   tst(limit, 0x2); // trailing character ?
2533   b(Lvector, eq);
2534 
2535   // compare the trailing char
2536   sub(limit, limit, sizeof(jchar));
2537   ldrh(chr1, Address(ary1, limit));
2538   ldrh(chr2, Address(ary2, limit));
2539   cmp(chr1, chr2);
2540   mov(result, 0, ne);     // not equal
2541   b(Ldone, ne);
2542 
2543   // only one char ?
2544   tst(limit, limit);
2545   mov(result, 1, eq);
2546   b(Ldone, eq);
2547 
2548   // word by word compare, don't need an alignment check
2549   bind(Lvector);
2550 
2551   // Shift ary1 and ary2 to the end of the arrays, negate limit
2552   add(ary1, limit, ary1);
2553   add(ary2, limit, ary2);
2554   neg(limit, limit);
2555 
2556   bind(Lloop);
2557   ldr_u32(chr1, Address(ary1, limit));
2558   ldr_u32(chr2, Address(ary2, limit));
2559   cmp_32(chr1, chr2);
2560   mov(result, 0, ne);     // not equal
2561   b(Ldone, ne);
2562   adds(limit, limit, 2*sizeof(jchar));
2563   b(Lloop, ne);
2564 
2565   // Caller should set it:
2566   // mov(result_reg, 1);  //equal
2567 }
2568 #endif
2569 
2570 void MacroAssembler::inc_counter(address counter_addr, Register tmpreg1, Register tmpreg2) {
2571   mov_slow(tmpreg1, counter_addr);
2572   ldr_s32(tmpreg2, tmpreg1);
2573   add_32(tmpreg2, tmpreg2, 1);
2574   str_32(tmpreg2, tmpreg1);
2575 }
2576 
2577 void MacroAssembler::floating_cmp(Register dst) {
2578 #ifdef AARCH64
2579   NOT_TESTED();
2580   cset(dst, gt);            // 1 if '>', else 0
2581   csinv(dst, dst, ZR, ge);  // previous value if '>=', else -1
2582 #else
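       // Map the VFP compare result (read directly from FPSCR) to {-1, 0, +1}.
       // With N = bit 31, Z = bit 30, C = bit 29 and V = bit 28, setting bit 27
       // and eor'ing with (dst << 3) leaves bit 31 = N ^ V and bit 30 = !Z, so
       // the arithmetic shift right by 30 yields 1 for '>', 0 for '==' and -1
       // for '<' or unordered, the same mapping as the AArch64 sequence above.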
2583   vmrs(dst, FPSCR);
2584   orr(dst, dst, 0x08000000);
2585   eor(dst, dst, AsmOperand(dst, lsl, 3));
2586   mov(dst, AsmOperand(dst, asr, 30));
2587 #endif
2588 }
2589 
2590 void MacroAssembler::restore_default_fp_mode() {
2591 #ifdef AARCH64
2592   msr(SysReg_FPCR, ZR);
2593 #else
2594 #ifndef __SOFTFP__
2595   // Round to Near mode, IEEE compatible, masked exceptions
2596   mov(Rtemp, 0);
2597   vmsr(FPSCR, Rtemp);
2598 #endif // !__SOFTFP__
2599 #endif // AARCH64
2600 }
2601 
2602 #ifndef AARCH64
2603 // 24-bit word range == 26-bit byte range
2604 bool check26(int offset) {
2605   // this could be simplified, but it mimics encoding and decoding
2606   // an actual branch instruction
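       // A branch encodes a signed 24-bit word offset: '<< 6 >> 8' drops the two
       // low bits and sign-extends from bit 25, and the decode reverses it, so
       // the round trip preserves the offset only if it is word-aligned and lies
       // within the signed 26-bit byte range a branch can reach.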
2607   int off1 = offset << 6 >> 8;
2608   int encoded = off1 & ((1<<24)-1);
2609   int decoded = encoded << 8 >> 6;
2610   return offset == decoded;
2611 }
2612 #endif // !AARCH64
2613 
2614 // Perform some slight adjustments so the default 32MB code cache
2615 // is fully reachable.
2616 static inline address first_cache_address() {
2617   return CodeCache::low_bound() + sizeof(HeapBlock::Header);
2618 }
2619 static inline address last_cache_address() {
2620   return CodeCache::high_bound() - Assembler::InstructionSize;
2621 }
2622 
2623 #ifdef AARCH64
2624 // Can we reach target using ADRP?
2625 bool MacroAssembler::page_reachable_from_cache(address target) {
2626   intptr_t cl = (intptr_t)first_cache_address() & ~0xfff;
2627   intptr_t ch = (intptr_t)last_cache_address() & ~0xfff;
2628   intptr_t addr = (intptr_t)target & ~0xfff;
2629 
2630   intptr_t loffset = addr - cl;
2631   intptr_t hoffset = addr - ch;
2632   return is_imm_in_range(loffset >> 12, 21, 0) && is_imm_in_range(hoffset >> 12, 21, 0);
2633 }
2634 #endif
2635 
2636 // Can we reach target using unconditional branch or call from anywhere
2637 // in the code cache (because code can be relocated)?
2638 bool MacroAssembler::_reachable_from_cache(address target) {
2639 #ifdef __thumb__
2640   if ((1 & (intptr_t)target) != 0) {
2641     // Return false to avoid 'b' if we need switching to THUMB mode.
2642     return false;
2643   }
2644 #endif
2645 
2646   address cl = first_cache_address();
2647   address ch = last_cache_address();
2648 
2649   if (ForceUnreachable) {
2650     // Only addresses from CodeCache can be treated as reachable.
2651     if (target < CodeCache::low_bound() || CodeCache::high_bound() < target) {
2652       return false;
2653     }
2654   }
2655 
2656   intptr_t loffset = (intptr_t)target - (intptr_t)cl;
2657   intptr_t hoffset = (intptr_t)target - (intptr_t)ch;
2658 
2659 #ifdef AARCH64
2660   return is_offset_in_range(loffset, 26) && is_offset_in_range(hoffset, 26);
2661 #else
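       // On 32-bit ARM the branch offset is relative to PC, which reads as the
       // address of the branch instruction plus 8, hence the -8 adjustment.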
2662   return check26(loffset - 8) && check26(hoffset - 8);
2663 #endif
2664 }
2665 
2666 bool MacroAssembler::reachable_from_cache(address target) {
2667   assert(CodeCache::contains(pc()), "not supported");
2668   return _reachable_from_cache(target);
2669 }
2670 
2671 // Can we reach the entire code cache from anywhere else in the code cache?
2672 bool MacroAssembler::_cache_fully_reachable() {
2673   address cl = first_cache_address();
2674   address ch = last_cache_address();
2675   return _reachable_from_cache(cl) && _reachable_from_cache(ch);
2676 }
2677 
2678 bool MacroAssembler::cache_fully_reachable() {
2679   assert(CodeCache::contains(pc()), "not supported");
2680   return _cache_fully_reachable();
2681 }
2682 
2683 void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch NOT_AARCH64_ARG(AsmCondition cond)) {
2684   assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported");
2685   if (reachable_from_cache(target)) {
2686     relocate(rtype);
2687     b(target NOT_AARCH64_ARG(cond));
2688     return;
2689   }
2690 
2691   // Note: relocate is not needed for the code below,
2692   // encoding targets in absolute format.
2693   if (ignore_non_patchable_relocations()) {
2694     rtype = relocInfo::none;
2695   }
2696 
2697 #ifdef AARCH64
2698   assert (scratch != noreg, "should be specified");
2699   InlinedAddress address_literal(target, rtype);
2700   ldr_literal(scratch, address_literal);
2701   br(scratch);
2702   int off = offset();
2703   bind_literal(address_literal);
2704 #ifdef COMPILER2
2705   if (offset() - off == wordSize) {
2706     // no padding, so insert nop for worst-case sizing
2707     nop();
2708   }
2709 #endif
2710 #else
2711   if (VM_Version::supports_movw() && (scratch != noreg) && (rtype == relocInfo::none)) {
2712     // Note: this version cannot be (atomically) patched
2713     mov_slow(scratch, (intptr_t)target, cond);
2714     bx(scratch, cond);
2715   } else {
2716     Label skip;
2717     InlinedAddress address_literal(target);
2718     if (cond != al) {
2719       b(skip, inverse(cond));
2720     }
2721     relocate(rtype);
2722     ldr_literal(PC, address_literal);
2723     bind_literal(address_literal);
2724     bind(skip);
2725   }
2726 #endif // AARCH64
2727 }
2728 
2729 // Similar to jump except that:
2730 // - near calls are valid only if any destination in the cache is near
2731 // - no movt/movw (not atomically patchable)
2732 void MacroAssembler::patchable_jump(address target, relocInfo::relocType rtype, Register scratch NOT_AARCH64_ARG(AsmCondition cond)) {
2733   assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported");
2734   if (cache_fully_reachable()) {
2735     // Note: this assumes that all possible targets (the initial one
2736     // and the addresses patched to) are all in the code cache.
2737     assert(CodeCache::contains(target), "target might be too far");
2738     relocate(rtype);
2739     b(target NOT_AARCH64_ARG(cond));
2740     return;
2741   }
2742 
2743   // Discard the relocation information if not needed for CacheCompiledCode
2744   // since the next encodings are all in absolute format.
2745   if (ignore_non_patchable_relocations()) {
2746     rtype = relocInfo::none;
2747   }
2748 
2749 #ifdef AARCH64
2750   assert (scratch != noreg, "should be specified");
2751   InlinedAddress address_literal(target);
2752   relocate(rtype);
2753   ldr_literal(scratch, address_literal);
2754   br(scratch);
2755   int off = offset();
2756   bind_literal(address_literal);
2757 #ifdef COMPILER2
2758   if (offset() - off == wordSize) {
2759     // no padding, so insert nop for worst-case sizing
2760     nop();
2761   }
2762 #endif
2763 #else
2764   {
2765     Label skip;
2766     InlinedAddress address_literal(target);
2767     if (cond != al) {
2768       b(skip, inverse(cond));
2769     }
2770     relocate(rtype);
2771     ldr_literal(PC, address_literal);
2772     bind_literal(address_literal);
2773     bind(skip);
2774   }
2775 #endif // AARCH64
2776 }
2777 
2778 void MacroAssembler::call(address target, RelocationHolder rspec NOT_AARCH64_ARG(AsmCondition cond)) {
2779   Register scratch = LR;
2780   assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported");
2781   if (reachable_from_cache(target)) {
2782     relocate(rspec);
2783     bl(target NOT_AARCH64_ARG(cond));
2784     return;
2785   }
2786 
2787   // Note: relocate is not needed for the code below,
2788   // encoding targets in absolute format.
2789   if (ignore_non_patchable_relocations()) {
2790     // This assumes the information was needed only for relocating the code.
2791     rspec = RelocationHolder::none;
2792   }
2793 
2794 #ifndef AARCH64
2795   if (VM_Version::supports_movw() && (rspec.type() == relocInfo::none)) {
2796     // Note: this version cannot be (atomically) patched
2797     mov_slow(scratch, (intptr_t)target, cond);
2798     blx(scratch, cond);
2799     return;
2800   }
2801 #endif
2802 
2803   {
2804     Label ret_addr;
2805 #ifndef AARCH64
2806     if (cond != al) {
2807       b(ret_addr, inverse(cond));
2808     }
2809 #endif
2810 
2811 
2812 #ifdef AARCH64
2813     // TODO-AARCH64: make more optimal implementation
2814     // [ Keep in sync with MacroAssembler::call_size ]
2815     assert(rspec.type() == relocInfo::none, "call reloc not implemented");
2816     mov_slow(scratch, target);
2817     blr(scratch);
2818 #else
2819     InlinedAddress address_literal(target);
2820     relocate(rspec);
2821     adr(LR, ret_addr);
2822     ldr_literal(PC, address_literal);
2823 
2824     bind_literal(address_literal);
2825     bind(ret_addr);
2826 #endif
2827   }
2828 }
2829 
2830 #if defined(AARCH64) && defined(COMPILER2)
2831 int MacroAssembler::call_size(address target, bool far, bool patchable) {
2832   // FIXME: mov_slow is variable-length
2833   if (!far) return 1; // bl
2834   if (patchable) return 2;  // ldr; blr
2835   return instr_count_for_mov_slow((intptr_t)target) + 1;
2836 }
2837 #endif
2838 
2839 int MacroAssembler::patchable_call(address target, RelocationHolder const& rspec, bool c2) {
2840   assert(rspec.type() == relocInfo::static_call_type ||
2841          rspec.type() == relocInfo::none ||
2842          rspec.type() == relocInfo::opt_virtual_call_type, "not supported");
2843 
2844   // Always generate the relocation information, needed for patching
2845   relocate(rspec); // used by NativeCall::is_call_before()
2846   if (cache_fully_reachable()) {
2847     // Note: this assumes that all possible targets (the initial one
2848     // and the addresses patched to) are all in the code cache.
2849     assert(CodeCache::contains(target), "target might be too far");
2850     bl(target);
2851   } else {
2852 #if defined(AARCH64) && defined(COMPILER2)
2853     if (c2) {
2854       // return address needs to match call_size().
2855       // no need to trash Rtemp
2856       int off = offset();
2857       Label skip_literal;
2858       InlinedAddress address_literal(target);
2859       ldr_literal(LR, address_literal);
2860       blr(LR);
2861       int ret_addr_offset = offset();
2862       assert(offset() - off == call_size(target, true, true) * InstructionSize, "need to fix call_size()");
2863       b(skip_literal);
2864       int off2 = offset();
2865       bind_literal(address_literal);
2866       if (offset() - off2 == wordSize) {
2867         // no padding, so insert nop for worst-case sizing
2868         nop();
2869       }
2870       bind(skip_literal);
2871       return ret_addr_offset;
2872     }
2873 #endif
2874     Label ret_addr;
2875     InlinedAddress address_literal(target);
2876 #ifdef AARCH64
2877     ldr_literal(Rtemp, address_literal);
2878     adr(LR, ret_addr);
2879     br(Rtemp);
2880 #else
2881     adr(LR, ret_addr);
2882     ldr_literal(PC, address_literal);
2883 #endif
2884     bind_literal(address_literal);
2885     bind(ret_addr);
2886   }
2887   return offset();
2888 }
2889 
2890 // ((OopHandle)result).resolve();
2891 void MacroAssembler::resolve_oop_handle(Register result) {
2892   // OopHandle::resolve is an indirection.
2893   ldr(result, Address(result, 0));
2894 }
2895 
2896 void MacroAssembler::load_mirror(Register mirror, Register method, Register tmp) {
2897   const int mirror_offset = in_bytes(Klass::java_mirror_offset());
2898   ldr(tmp, Address(method, Method::const_offset()));
2899   ldr(tmp, Address(tmp,  ConstMethod::constants_offset()));
2900   ldr(tmp, Address(tmp, ConstantPool::pool_holder_offset_in_bytes()));
2901   ldr(mirror, Address(tmp, mirror_offset));
2902   resolve_oop_handle(mirror);
2903 }
2904 
2905 
2906 ///////////////////////////////////////////////////////////////////////////////
2907 
2908 // Compressed pointers
2909 
2910 #ifdef AARCH64
2911 
2912 void MacroAssembler::load_klass(Register dst_klass, Register src_oop) {
2913   if (UseCompressedClassPointers) {
2914     ldr_w(dst_klass, Address(src_oop, oopDesc::klass_offset_in_bytes()));
2915     decode_klass_not_null(dst_klass);
2916   } else {
2917     ldr(dst_klass, Address(src_oop, oopDesc::klass_offset_in_bytes()));
2918   }
2919 }
2920 
2921 #else
2922 
2923 void MacroAssembler::load_klass(Register dst_klass, Register src_oop, AsmCondition cond) {
2924   ldr(dst_klass, Address(src_oop, oopDesc::klass_offset_in_bytes()), cond);
2925 }
2926 
2927 #endif // AARCH64
2928 
2929 // Blows src_klass.
2930 void MacroAssembler::store_klass(Register src_klass, Register dst_oop) {
2931 #ifdef AARCH64
2932   if (UseCompressedClassPointers) {
2933     assert(src_klass != dst_oop, "not enough registers");
2934     encode_klass_not_null(src_klass);
2935     str_w(src_klass, Address(dst_oop, oopDesc::klass_offset_in_bytes()));
2936     return;
2937   }
2938 #endif // AARCH64
2939   str(src_klass, Address(dst_oop, oopDesc::klass_offset_in_bytes()));
2940 }
2941 
2942 #ifdef AARCH64
2943 
2944 void MacroAssembler::store_klass_gap(Register dst) {
2945   if (UseCompressedClassPointers) {
2946     str_w(ZR, Address(dst, oopDesc::klass_gap_offset_in_bytes()));
2947   }
2948 }
2949 
2950 #endif // AARCH64
2951 
2952 
2953 void MacroAssembler::load_heap_oop(Register dst, Address src) {
2954 #ifdef AARCH64
2955   if (UseCompressedOops) {
2956     ldr_w(dst, src);
2957     decode_heap_oop(dst);
2958     return;
2959   }
2960 #endif // AARCH64
2961   ldr(dst, src);
2962 }
2963 
2964 // Blows src and flags.
2965 void MacroAssembler::store_heap_oop(Register src, Address dst) {
2966 #ifdef AARCH64
2967   if (UseCompressedOops) {
2968     assert(!dst.uses(src), "not enough registers");
2969     encode_heap_oop(src);
2970     str_w(src, dst);
2971     return;
2972   }
2973 #endif // AARCH64
2974   str(src, dst);
2975 }
2976 
2977 void MacroAssembler::store_heap_oop_null(Register src, Address dst) {
2978 #ifdef AARCH64
2979   if (UseCompressedOops) {
2980     str_w(src, dst);
2981     return;
2982   }
2983 #endif // AARCH64
2984   str(src, dst);
2985 }
2986 
2987 
2988 #ifdef AARCH64
2989 
2990 // Algorithm must match oop.inline.hpp encode_heap_oop.
2991 void MacroAssembler::encode_heap_oop(Register dst, Register src) {
2992   // This code pattern is matched in NativeInstruction::skip_encode_heap_oop.
2993   // Update it when modifying this code.
2994   assert (UseCompressedOops, "must be compressed");
2995   assert (Universe::heap() != NULL, "java heap should be initialized");
2996 #ifdef ASSERT
2997   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
2998 #endif
2999   verify_oop(src);
3000   if (Universe::narrow_oop_base() == NULL) {
3001     if (Universe::narrow_oop_shift() != 0) {
3002       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3003       _lsr(dst, src, Universe::narrow_oop_shift());
3004     } else if (dst != src) {
3005       mov(dst, src);
3006     }
3007   } else {
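         // NULL must still encode to a narrow 0: if src is NULL the csel
         // substitutes Rheap_base so that the subtraction below yields 0.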
3008     tst(src, src);
3009     csel(dst, Rheap_base, src, eq);
3010     sub(dst, dst, Rheap_base);
3011     if (Universe::narrow_oop_shift() != 0) {
3012       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3013       _lsr(dst, dst, Universe::narrow_oop_shift());
3014     }
3015   }
3016 }
3017 
3018 // Same algorithm as oop.inline.hpp decode_heap_oop.
3019 void MacroAssembler::decode_heap_oop(Register dst, Register src) {
3020 #ifdef ASSERT
3021   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3022 #endif
3023   assert(Universe::narrow_oop_shift() == 0 || LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3024   if (Universe::narrow_oop_base() != NULL) {
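         // The tst/csel pair maps a narrow 0 back to NULL rather than to Rheap_base.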
3025     tst(src, src);
3026     add(dst, Rheap_base, AsmOperand(src, lsl, Universe::narrow_oop_shift()));
3027     csel(dst, dst, ZR, ne);
3028   } else {
3029     _lsl(dst, src, Universe::narrow_oop_shift());
3030   }
3031   verify_oop(dst);
3032 }
3033 
3034 #ifdef COMPILER2
3035 // Algorithm must match oop.inline.hpp encode_heap_oop.
3036 // Must preserve condition codes, or C2 encodeHeapOop_not_null rule
3037 // must be changed.
3038 void MacroAssembler::encode_heap_oop_not_null(Register dst, Register src) {
3039   assert (UseCompressedOops, "must be compressed");
3040   assert (Universe::heap() != NULL, "java heap should be initialized");
3041 #ifdef ASSERT
3042   verify_heapbase("MacroAssembler::encode_heap_oop: heap base corrupted?");
3043 #endif
3044   verify_oop(src);
3045   if (Universe::narrow_oop_base() == NULL) {
3046     if (Universe::narrow_oop_shift() != 0) {
3047       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3048       _lsr(dst, src, Universe::narrow_oop_shift());
3049     } else if (dst != src) {
3050           mov(dst, src);
3051     }
3052   } else {
3053     sub(dst, src, Rheap_base);
3054     if (Universe::narrow_oop_shift() != 0) {
3055       assert (LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3056       _lsr(dst, dst, Universe::narrow_oop_shift());
3057     }
3058   }
3059 }
3060 
3061 // Same algorithm as oop.inline.hpp decode_heap_oop.
3062 // Must preserve condition codes, or C2 decodeHeapOop_not_null rule
3063 // must be changed.
3064 void MacroAssembler::decode_heap_oop_not_null(Register dst, Register src) {
3065 #ifdef ASSERT
3066   verify_heapbase("MacroAssembler::decode_heap_oop: heap base corrupted?");
3067 #endif
3068   assert(Universe::narrow_oop_shift() == 0 || LogMinObjAlignmentInBytes == Universe::narrow_oop_shift(), "decode alg wrong");
3069   if (Universe::narrow_oop_base() != NULL) {
3070     add(dst, Rheap_base, AsmOperand(src, lsl, Universe::narrow_oop_shift()));
3071   } else {
3072     _lsl(dst, src, Universe::narrow_oop_shift());
3073   }
3074   verify_oop(dst);
3075 }
3076 
3077 void MacroAssembler::set_narrow_klass(Register dst, Klass* k) {
3078   assert(UseCompressedClassPointers, "should only be used for compressed header");
3079   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3080   int klass_index = oop_recorder()->find_index(k);
3081   RelocationHolder rspec = metadata_Relocation::spec(klass_index);
3082 
3083   // Relocation with special format (see relocInfo_arm.hpp).
3084   relocate(rspec);
3085   narrowKlass encoded_k = Klass::encode_klass(k);
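  // Materialize the 32-bit narrow klass as two 16-bit halves (movz/movk) so that
  // the metadata relocation recorded above can later re-patch the constant in place.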
3086   movz(dst, encoded_k & 0xffff, 0);
3087   movk(dst, (encoded_k >> 16) & 0xffff, 16);
3088 }
3089 
3090 void MacroAssembler::set_narrow_oop(Register dst, jobject obj) {
3091   assert(UseCompressedOops, "should only be used for compressed header");
3092   assert(oop_recorder() != NULL, "this assembler needs an OopRecorder");
3093   int oop_index = oop_recorder()->find_index(obj);
3094   RelocationHolder rspec = oop_Relocation::spec(oop_index);
3095 
3096   relocate(rspec);
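  // The 0xffff halves are only placeholders: the oop relocation recorded above is
  // expected to patch the movz/movk pair with the actual compressed oop bits.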
3097   movz(dst, 0xffff, 0);
3098   movk(dst, 0xffff, 16);
3099 }
3100 
3101 #endif // COMPILER2
3102 
3103 // Must preserve condition codes, or C2 encodeKlass_not_null rule
3104 // must be changed.
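// In effect: r = (r - narrow_klass_base) >> narrow_klass_shift. Rheap_base is
// borrowed as a scratch register for the base and restored via reinit_heapbase().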
3105 void MacroAssembler::encode_klass_not_null(Register r) {
3106   if (Universe::narrow_klass_base() != NULL) {
3107     // Use Rheap_base as a scratch register in which to temporarily load the narrow_klass_base.
3108     assert(r != Rheap_base, "Encoding a klass in Rheap_base");
3109     mov_slow(Rheap_base, Universe::narrow_klass_base());
3110     sub(r, r, Rheap_base);
3111   }
3112   if (Universe::narrow_klass_shift() != 0) {
3113     assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3114     _lsr(r, r, Universe::narrow_klass_shift());
3115   }
3116   if (Universe::narrow_klass_base() != NULL) {
3117     reinit_heapbase();
3118   }
3119 }
3120 
3121 // Must preserve condition codes, or C2 encodeKlass_not_null rule
3122 // must be changed.
3123 void MacroAssembler::encode_klass_not_null(Register dst, Register src) {
3124   if (dst == src) {
3125     encode_klass_not_null(src);
3126     return;
3127   }
3128   if (Universe::narrow_klass_base() != NULL) {
3129     mov_slow(dst, (int64_t)Universe::narrow_klass_base());
3130     sub(dst, src, dst);
3131     if (Universe::narrow_klass_shift() != 0) {
3132       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3133       _lsr(dst, dst, Universe::narrow_klass_shift());
3134     }
3135   } else {
3136     if (Universe::narrow_klass_shift() != 0) {
3137       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3138       _lsr(dst, src, Universe::narrow_klass_shift());
3139     } else {
3140       mov(dst, src);
3141     }
3142   }
3143 }
3144 
3145 // instr_count_for_decode_klass_not_null() counts the instructions generated by
3146 // decode_klass_not_null(Register r) and reinit_heapbase() when
3147 // (Universe::heap() != NULL). Hence, if the instructions they generate change,
3148 // this method must be updated to match.
3149 int MacroAssembler::instr_count_for_decode_klass_not_null() {
3150   assert(UseCompressedClassPointers, "only for compressed klass ptrs");
3151   assert(Universe::heap() != NULL, "java heap should be initialized");
3152   if (Universe::narrow_klass_base() != NULL) {
3153     return instr_count_for_mov_slow(Universe::narrow_klass_base()) +  // mov_slow
3154            1 +                                                        // add
3155            instr_count_for_mov_slow(Universe::narrow_ptrs_base());    // reinit_heapbase() = mov_slow
3156   } else {
3157     if (Universe::narrow_klass_shift() != 0) {
3158       return 1;
3159     }
3160   }
3161   return 0;
3162 }
3163 
3164 // Must preserve condition codes, or C2 decodeKlass_not_null rule
3165 // must be changed.
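// In effect: r = narrow_klass_base + (r << narrow_klass_shift), again borrowing
// Rheap_base as a scratch register when a base is needed.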
3166 void MacroAssembler::decode_klass_not_null(Register r) {
3167   int off = offset();
3168   assert(UseCompressedClassPointers, "should only be used for compressed headers");
3169   assert(Universe::heap() != NULL, "java heap should be initialized");
3170   assert(r != Rheap_base, "Decoding a klass in Rheap_base");
3171   // Do not add code that emits extra instructions here (e.g. debug-only checks):
3172   // instr_count_for_decode_klass_not_null() relies on the exact instruction count.
3173   // Also do not call verify_oop, as this code is itself used by verify_oop.
3173   if (Universe::narrow_klass_base() != NULL) {
3174     // Use Rheap_base as a scratch register in which to temporarily load the narrow_klass_base.
3175     mov_slow(Rheap_base, Universe::narrow_klass_base());
3176     add(r, Rheap_base, AsmOperand(r, lsl, Universe::narrow_klass_shift()));
3177     reinit_heapbase();
3178   } else {
3179     if (Universe::narrow_klass_shift() != 0) {
3180       assert (LogKlassAlignmentInBytes == Universe::narrow_klass_shift(), "decode alg wrong");
3181       _lsl(r, r, Universe::narrow_klass_shift());
3182     }
3183   }
3184   assert((offset() - off) == (instr_count_for_decode_klass_not_null() * InstructionSize), "need to fix instr_count_for_decode_klass_not_null");
3185 }
3186 
3187 // Must preserve condition codes, or C2 decodeKlass_not_null rule
3188 // must be changed.
3189 void MacroAssembler::decode_klass_not_null(Register dst, Register src) {
3190   if (src == dst) {
3191     decode_klass_not_null(src);
3192     return;
3193   }
3194 
3195   assert(UseCompressedClassPointers, "should only be used for compressed headers");
3196   assert(Universe::heap() != NULL, "java heap should be initialized");
3197   assert(src != Rheap_base, "Decoding a klass in Rheap_base");
3198   assert(dst != Rheap_base, "Decoding a klass into Rheap_base");
3199   // Do not call verify_oop here, as this code is itself used by verify_oop.
3200   if (Universe::narrow_klass_base() != NULL) {
3201     mov_slow(dst, Universe::narrow_klass_base());
3202     add(dst, dst, AsmOperand(src, lsl, Universe::narrow_klass_shift()));
3203   } else {
3204     _lsl(dst, src, Universe::narrow_klass_shift());
3205   }
3206 }
3207 
3208 
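// Reloads Rheap_base. While the Java heap is not yet initialized, the final base
// value is unknown, so it is loaded indirectly through narrow_ptrs_base_addr();
// afterwards the constant can be materialized directly with mov_slow.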
3209 void MacroAssembler::reinit_heapbase() {
3210   if (UseCompressedOops || UseCompressedClassPointers) {
3211     if (Universe::heap() != NULL) {
3212       mov_slow(Rheap_base, Universe::narrow_ptrs_base());
3213     } else {
3214       ldr_global_ptr(Rheap_base, (address)Universe::narrow_ptrs_base_addr());
3215     }
3216   }
3217 }
3218 
3219 #ifdef ASSERT
3220 void MacroAssembler::verify_heapbase(const char* msg) {
3221   // This code pattern is matched in NativeInstruction::skip_verify_heapbase.
3222   // Keep that code in sync when modifying this pattern.
3223   assert (UseCompressedOops, "should be compressed");
3224   assert (Universe::heap() != NULL, "java heap should be initialized");
3225   if (CheckCompressedOops) {
3226     Label ok;
3227     str(Rthread, Address(Rthread, in_bytes(JavaThread::in_top_frame_unsafe_section_offset())));
3228     raw_push(Rtemp, ZR);
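    // Save the flags (NZCV) across the check so that verify_heapbase preserves
    // the caller's condition codes; they are restored via msr below.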
3229     mrs(Rtemp, Assembler::SysReg_NZCV);
3230     str(Rtemp, Address(SP, 1 * wordSize));
3231     mov_slow(Rtemp, Universe::narrow_ptrs_base());
3232     cmp(Rheap_base, Rtemp);
3233     b(ok, eq);
3234     stop(msg);
3235     bind(ok);
3236     ldr(Rtemp, Address(SP, 1 * wordSize));
3237     msr(Assembler::SysReg_NZCV, Rtemp);
3238     raw_pop(Rtemp, ZR);
3239     str(ZR, Address(Rthread, in_bytes(JavaThread::in_top_frame_unsafe_section_offset())));
3240   }
3241 }
3242 #endif // ASSERT
3243 
3244 #endif // AARCH64
3245 
3246 #ifdef COMPILER2
3247 void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2 AARCH64_ONLY_ARG(Register Rscratch3))
3248 {
3249   assert(VM_Version::supports_ldrex(), "unsupported, yet?");
3250 
3251   Register Rmark      = Rscratch2;
3252 
3253   assert(Roop != Rscratch, "");
3254   assert(Roop != Rmark, "");
3255   assert(Rbox != Rscratch, "");
3256   assert(Rbox != Rmark, "");
3257 
3258   Label fast_lock, done;
3259 
3260   if (UseBiasedLocking && !UseOptoBiasInlining) {
3261     Label failed;
3262 #ifdef AARCH64
3263     biased_locking_enter(Roop, Rmark, Rscratch, false, Rscratch3, done, failed);
3264 #else
3265     biased_locking_enter(Roop, Rmark, Rscratch, false, noreg, done, failed);
3266 #endif
3267     bind(failed);
3268   }
3269 
3270   ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes()));
3271   tst(Rmark, markOopDesc::unlocked_value);
3272   b(fast_lock, ne);
3273 
3274   // Check for recursive locking.
3275   // See the comments in InterpreterMacroAssembler::lock_object for an
3276   // explanation of the fast recursive locking check.
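  // In short: the lock is recursive iff the mark word is a pointer into the current
  // thread's stack, i.e. its low two bits are clear and (mark - SP) is smaller than
  // one page. In that case a zero displaced header is stored into Rbox.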
3277 #ifdef AARCH64
3278   intptr_t mask = ((intptr_t)3) - ((intptr_t)os::vm_page_size());
3279   Assembler::LogicalImmediate imm(mask, false);
3280   mov(Rscratch, SP);
3281   sub(Rscratch, Rmark, Rscratch);
3282   ands(Rscratch, Rscratch, imm);
3283   b(done, ne); // exit with failure
3284   str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); // set to zero
3285   b(done);
3286 
3287 #else
3288   // -1- test low 2 bits
3289   movs(Rscratch, AsmOperand(Rmark, lsl, 30));
3290   // -2- test (hdr - SP) if the low two bits are 0
3291   sub(Rscratch, Rmark, SP, eq);
3292   movs(Rscratch, AsmOperand(Rscratch, lsr, exact_log2(os::vm_page_size())), eq);
3293   // If still 'eq' then recursive locking OK
3294   str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()), eq); // set to zero
3295   b(done);
3296 #endif
3297 
3298   bind(fast_lock);
3299   str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
3300 
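  // Attempt to swing the object's mark word to point at the BasicLock (Rbox) via CAS.
  // On failure, control falls through to done (allow_fallthrough_on_failure), leaving
  // it to the caller to dispatch to the slow path.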
3301   bool allow_fallthrough_on_failure = true;
3302   bool one_shot = true;
3303   cas_for_lock_acquire(Rmark, Rbox, Roop, Rscratch, done, allow_fallthrough_on_failure, one_shot);
3304 
3305   bind(done);
3306 
3307 }
3308 
3309 void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2  AARCH64_ONLY_ARG(Register Rscratch3))
3310 {
3311   assert(VM_Version::supports_ldrex(), "unsupported, yet?");
3312 
3313   Register Rmark      = Rscratch2;
3314 
3315   assert(Roop != Rscratch, "");
3316   assert(Roop != Rmark, "");
3317   assert(Rbox != Rscratch, "");
3318   assert(Rbox != Rmark, "");
3319 
3320   Label done;
3321 
3322   if (UseBiasedLocking && !UseOptoBiasInlining) {
3323     biased_locking_exit(Roop, Rscratch, done);
3324   }
3325 
3326   ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes()));
3327   // If hdr is NULL, we've got recursive locking and there's nothing more to do
3328   cmp(Rmark, 0);
3329   b(done, eq);
3330 
3331   // Restore the object header
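  // The CAS puts the displaced mark word back only if the mark still points at our
  // BasicLock; if the lock was inflated or is contended, the CAS fails and control
  // falls through so that the caller can take the slow path.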
3332   bool allow_fallthrough_on_failure = true;
3333   bool one_shot = true;
3334   cas_for_lock_release(Rmark, Rbox, Roop, Rscratch, done, allow_fallthrough_on_failure, one_shot);
3335 
3336   bind(done);
3337 
3338 }
3339 #endif // COMPILER2
3340