--- old/make/autoconf/build-aux/config.sub 2018-09-25 19:23:21.000000000 +0300 +++ new/make/autoconf/build-aux/config.sub 2018-09-25 19:23:21.000000000 +0300 @@ -45,6 +45,11 @@ config=`echo $1 | sed 's/^aarch64-/arm-/'` sub_args="$sub_args $config" shift; ;; + aarch32-* ) + config=`echo $1 | sed 's/^aarch32-/arm-/'` + sub_args="$sub_args $config" + replace="aarch32-" + shift; ;; - ) # Use stdin as input. sub_args="$sub_args $1" shift; break ;; --- old/make/autoconf/flags.m4 2018-09-25 19:23:23.000000000 +0300 +++ new/make/autoconf/flags.m4 2018-09-25 19:23:22.000000000 +0300 @@ -38,8 +38,9 @@ if test "x$with_abi_profile" != x; then if test "x$OPENJDK_TARGET_CPU" != xarm && \ - test "x$OPENJDK_TARGET_CPU" != xaarch64; then - AC_MSG_ERROR([--with-abi-profile only available on arm/aarch64]) + test "x$OPENJDK_TARGET_CPU" != xaarch64 && \ + test "x$OPENJDK_TARGET_CPU" != xaarch32 ; then + AC_MSG_ERROR([--with-abi-profile only available on arm/aarch64/aarch32]) fi OPENJDK_TARGET_ABI_PROFILE=$with_abi_profile @@ -65,6 +66,14 @@ # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME ARM_FLOAT_TYPE= ARM_ARCH_TYPE_FLAGS= + elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xgnueabihf; then + # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME + ARM_FLOAT_TYPE= + ARM_ARCH_TYPE_FLAGS= + elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xgnueabi; then + # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME + ARM_FLOAT_TYPE= + ARM_ARCH_TYPE_FLAGS= elif test "x$OPENJDK_TARGET_ABI_PROFILE" = xaarch64; then # No special flags, just need to trigger setting JDK_ARCH_ABI_PROP_NAME ARM_FLOAT_TYPE= --- old/make/autoconf/hotspot.m4 2018-09-25 19:23:24.000000000 +0300 +++ new/make/autoconf/hotspot.m4 2018-09-25 19:23:24.000000000 +0300 @@ -305,8 +305,14 @@ # Override hotspot cpu definitions for ARM platforms if test "x$OPENJDK_TARGET_CPU" = xarm; then - HOTSPOT_TARGET_CPU=arm_32 - HOTSPOT_TARGET_CPU_DEFINE="ARM32" + if test "x$HOTSPOT_TARGET_CPU_PORT" = xarm; then + HOTSPOT_TARGET_CPU=arm_32 + HOTSPOT_TARGET_CPU_DEFINE="ARM32" + else + HOTSPOT_TARGET_CPU=aarch32 + HOTSPOT_TARGET_CPU_ARCH=aarch32 + HOTSPOT_TARGET_CPU_DEFINE="AARCH32" + fi elif test "x$OPENJDK_TARGET_CPU" = xaarch64 && test "x$HOTSPOT_TARGET_CPU_PORT" = xarm64; then HOTSPOT_TARGET_CPU=arm_64 HOTSPOT_TARGET_CPU_ARCH=arm @@ -558,23 +564,27 @@ ################################################################################ # -# Specify which sources will be used to build the 64-bit ARM port +# Specify which sources will be used to build the ARM port # -# --with-cpu-port=arm64 will use hotspot/src/cpu/arm +# --with-cpu-port=arm will use hotspot/src/cpu/arm # --with-cpu-port=aarch64 will use hotspot/src/cpu/aarch64 +# --with-cpu-port=aarch32 will use hotspot/src/cpu/aarch32 # AC_DEFUN([SETUP_HOTSPOT_TARGET_CPU_PORT], [ AC_ARG_WITH(cpu-port, [AS_HELP_STRING([--with-cpu-port], - [specify sources to use for Hotspot 64-bit ARM port (arm64,aarch64) @<:@aarch64@:>@ ])]) + [specify sources to use for Hotspot ARM port (arm,aarch64,aarch32) @<:@aarch64@:>@ ])]) if test "x$with_cpu_port" != x; then - if test "x$OPENJDK_TARGET_CPU" != xaarch64; then - AC_MSG_ERROR([--with-cpu-port only available on aarch64]) - fi - if test "x$with_cpu_port" != xarm64 && \ - test "x$with_cpu_port" != xaarch64; then - AC_MSG_ERROR([--with-cpu-port must specify arm64 or aarch64]) + if test "x$OPENJDK_TARGET_CPU" != xaarch64 && \ + test "x$OPENJDK_TARGET_CPU" != xaarch32 && \ + test "x$OPENJDK_TARGET_CPU" != xarm ; then + 
AC_MSG_ERROR([--with-cpu-port only available on arm/aarch64/32]) + fi + if test "x$with_cpu_port" != xarm && \ + test "x$with_cpu_port" != xaarch64 && \ + test "x$with_cpu_port" != xaarch32 ; then + AC_MSG_ERROR([--with-cpu-port must specify arm, aarch32 or aarch64]) fi HOTSPOT_TARGET_CPU_PORT="$with_cpu_port" fi --- old/make/autoconf/platform.m4 2018-09-25 19:23:25.000000000 +0300 +++ new/make/autoconf/platform.m4 2018-09-25 19:23:25.000000000 +0300 @@ -28,6 +28,7 @@ # VAR_CPU, VAR_CPU_ARCH, VAR_CPU_BITS and VAR_CPU_ENDIAN. AC_DEFUN([PLATFORM_EXTRACT_VARS_FROM_CPU], [ + echo "LOOKING UP CPU ARCH $1" # First argument is the cpu name from the trip/quad case "$1" in x86_64) @@ -54,6 +55,12 @@ VAR_CPU_BITS=32 VAR_CPU_ENDIAN=little ;; + aarch32) + VAR_CPU=aarch32 + VAR_CPU_ARCH=aarch32 + VAR_CPU_BITS=32 + VAR_CPU_ENDIAN=little + ;; aarch64) VAR_CPU=aarch64 VAR_CPU_ARCH=aarch64 @@ -386,6 +393,8 @@ elif test "x$OPENJDK_$1_OS" != xmacosx && test "x$OPENJDK_$1_CPU" = xx86_64; then # On all platforms except macosx, we replace x86_64 with amd64. OPENJDK_$1_CPU_OSARCH="amd64" + elif test "x$OPENJDK_$1_CPU" = xaarch32; then + OPENJDK_$1_CPU_OSARCH="arm" fi AC_SUBST(OPENJDK_$1_CPU_OSARCH) --- old/make/hotspot/lib/JvmOverrideFiles.gmk 2018-09-25 19:23:26.000000000 +0300 +++ new/make/hotspot/lib/JvmOverrideFiles.gmk 2018-09-25 19:23:26.000000000 +0300 @@ -38,6 +38,10 @@ BUILD_LIBJVM_interp_masm_x86.cpp_CXXFLAGS := -Wno-uninitialized endif +ifeq ($(TOOLCHAIN_TYPE), gcc) + BUILD_LIBJVM_vm_version_aarch32_2.cpp_CXXFLAGS := -fno-stack-protector +endif + ifeq ($(OPENJDK_TARGET_OS), linux) BUILD_LIBJVM_ostream.cpp_CXXFLAGS := -D_FILE_OFFSET_BITS=64 BUILD_LIBJVM_logFileOutput.cpp_CXXFLAGS := -D_FILE_OFFSET_BITS=64 --- old/make/lib/Lib-jdk.hotspot.agent.gmk 2018-09-25 19:23:27.000000000 +0300 +++ new/make/lib/Lib-jdk.hotspot.agent.gmk 2018-09-25 19:23:27.000000000 +0300 @@ -67,6 +67,8 @@ LIBS_windows := dbgeng.lib, \ )) +ifneq ($(OPENJDK_TARGET_CPU), aarch32) TARGETS += $(BUILD_LIBSA) +endif ################################################################################ --- old/src/hotspot/os/linux/os_linux.cpp 2018-09-25 19:23:28.000000000 +0300 +++ new/src/hotspot/os/linux/os_linux.cpp 2018-09-25 19:23:28.000000000 +0300 @@ -1773,6 +1773,8 @@ static Elf32_Half running_arch_code=EM_AARCH64; #elif (defined ARM) static Elf32_Half running_arch_code=EM_ARM; +#elif (defined AARCH32) + static Elf32_Half running_arch_code=EM_ARM; #elif (defined S390) static Elf32_Half running_arch_code=EM_S390; #elif (defined ALPHA) @@ -3516,6 +3518,7 @@ AARCH64_ONLY(2 * M) AMD64_ONLY(2 * M) ARM32_ONLY(2 * M) + AARCH32_ONLY(2 * M) IA32_ONLY(4 * M) IA64_ONLY(256 * M) PPC_ONLY(4 * M) --- old/src/hotspot/share/adlc/adlparse.cpp 2018-09-25 19:23:30.000000000 +0300 +++ new/src/hotspot/share/adlc/adlparse.cpp 2018-09-25 19:23:29.000000000 +0300 @@ -483,7 +483,8 @@ else if (!strcmp(ident, "format")) oper->_format = format_parse(); else if (!strcmp(ident, "interface")) oper->_interface = interface_parse(); // Check identifier to see if it is the name of an attribute - else if (((attr = _globalNames[ident]->is_attribute()) != NULL) && + else if (_globalNames[ident] && + ((attr = _globalNames[ident]->is_attribute()) != NULL) && (attr->_atype == OP_ATTR)) oper->_attribs = attr_parse(ident); else { parse_err(SYNERR, "expected one of - constraint, predicate, match, encode, format, construct, or the name of a defined operand attribute at %s\n", ident); --- old/src/hotspot/share/c1/c1_Compiler.cpp 2018-09-25 19:23:31.000000000 +0300 +++ 
new/src/hotspot/share/c1/c1_Compiler.cpp 2018-09-25 19:23:31.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -215,10 +216,19 @@ case vmIntrinsics::_updateCRC32: case vmIntrinsics::_updateBytesCRC32: case vmIntrinsics::_updateByteBufferCRC32: -#if defined(SPARC) || defined(S390) || defined(PPC64) || defined(AARCH64) +#if defined(SPARC) || defined(S390) || defined(PPC64) || defined(AARCH64) || defined(AARCH32) case vmIntrinsics::_updateBytesCRC32C: case vmIntrinsics::_updateDirectByteBufferCRC32C: #endif +#ifdef AARCH32 + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + case vmIntrinsics::_sha_implCompress: + case vmIntrinsics::_sha2_implCompress: + case vmIntrinsics::_sha5_implCompress: + case vmIntrinsics::_montgomeryMultiply: + case vmIntrinsics::_montgomerySquare: +#endif case vmIntrinsics::_vectorizedMismatch: case vmIntrinsics::_compareAndSetInt: case vmIntrinsics::_compareAndSetObject: --- old/src/hotspot/share/c1/c1_LIR.cpp 2018-09-25 19:23:32.000000000 +0300 +++ new/src/hotspot/share/c1/c1_LIR.cpp 2018-09-25 19:23:32.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -145,6 +146,7 @@ case T_FLOAT: // FP return values can be also in CPU registers on ARM and PPC32 (softfp ABI) assert((kindfield == fpu_register || kindfield == stack_value + AARCH32_ONLY(|| kindfield == cpu_register) ARM_ONLY(|| kindfield == cpu_register) PPC32_ONLY(|| kindfield == cpu_register) ) && size_field() == single_size, "must match"); @@ -1492,7 +1494,7 @@ out->print("fpu%d", fpu_regnr()); } else if (is_double_fpu()) { out->print("fpu%d", fpu_regnrLo()); -#elif defined(ARM) +#elif defined(ARM) || defined(AARCH32) } else if (is_single_fpu()) { out->print("s%d", fpu_regnr()); } else if (is_double_fpu()) { --- old/src/hotspot/share/c1/c1_LIR.hpp 2018-09-25 19:23:33.000000000 +0300 +++ new/src/hotspot/share/c1/c1_LIR.hpp 2018-09-25 19:23:33.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -693,18 +694,40 @@ #ifdef __SOFTFP__ case T_FLOAT: - res = (LIR_Opr)(intptr_t)((index << LIR_OprDesc::data_shift) | - LIR_OprDesc::float_type | - LIR_OprDesc::cpu_register | - LIR_OprDesc::single_size | - LIR_OprDesc::virtual_mask); +#ifdef AARCH32 + if (hasFPU()) { + res = (LIR_Opr)(intptr_t)((index << LIR_OprDesc::data_shift) | + LIR_OprDesc::float_type | + LIR_OprDesc::fpu_register | + LIR_OprDesc::single_size | + LIR_OprDesc::virtual_mask); + } else +#endif // AARCH32 + { + res = (LIR_Opr)(intptr_t)((index << LIR_OprDesc::data_shift) | + LIR_OprDesc::float_type | + LIR_OprDesc::cpu_register | + LIR_OprDesc::single_size | + LIR_OprDesc::virtual_mask); + } break; case T_DOUBLE: - res = (LIR_Opr)(intptr_t)((index << LIR_OprDesc::data_shift) | - LIR_OprDesc::double_type | - LIR_OprDesc::cpu_register | - LIR_OprDesc::double_size | - LIR_OprDesc::virtual_mask); +#ifdef AARCH32 + if(hasFPU()) { + res = (LIR_Opr)(intptr_t)((index << LIR_OprDesc::data_shift) | + LIR_OprDesc::double_type | + LIR_OprDesc::fpu_register | + LIR_OprDesc::double_size | + LIR_OprDesc::virtual_mask); + } else +#endif + { + res = (LIR_Opr)(intptr_t)((index << LIR_OprDesc::data_shift) | + LIR_OprDesc::double_type | + LIR_OprDesc::cpu_register | + LIR_OprDesc::double_size | + LIR_OprDesc::virtual_mask); + } break; #else // __SOFTFP__ case T_FLOAT: --- old/src/hotspot/share/c1/c1_LIRGenerator.cpp 2018-09-25 19:23:35.000000000 +0300 +++ new/src/hotspot/share/c1/c1_LIRGenerator.cpp 2018-09-25 19:23:34.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -3068,6 +3069,29 @@ do_update_CRC32C(x); break; +#ifdef AARCH32 + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + do_aescrypt_block(x); + break; + + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + do_aescrypt_cbc(x); + break; + + case vmIntrinsics::_sha_implCompress: + case vmIntrinsics::_sha2_implCompress: + case vmIntrinsics::_sha5_implCompress: + do_sha(x); + break; + + case vmIntrinsics::_montgomeryMultiply: + case vmIntrinsics::_montgomerySquare: + do_montgomery_intrinsic(x); + break; +#endif + case vmIntrinsics::_vectorizedMismatch: do_vectorizedMismatch(x); break; --- old/src/hotspot/share/c1/c1_LIRGenerator.hpp 2018-09-25 19:23:36.000000000 +0300 +++ new/src/hotspot/share/c1/c1_LIRGenerator.hpp 2018-09-25 19:23:36.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -264,7 +265,16 @@ void do_Reference_get(Intrinsic* x); void do_update_CRC32(Intrinsic* x); void do_update_CRC32C(Intrinsic* x); +#ifdef AARCH32 + void do_update_CRC32_inner(Intrinsic* x, int is_crc32c); +#endif void do_vectorizedMismatch(Intrinsic* x); +#ifdef AARCH32 + void do_aescrypt_block(Intrinsic* x); + void do_aescrypt_cbc(Intrinsic* x); + void do_sha(Intrinsic* x); + void do_montgomery_intrinsic(Intrinsic *x); +#endif public: LIR_Opr call_runtime(BasicTypeArray* signature, LIRItemList* args, address entry, ValueType* result_type, CodeEmitInfo* info); @@ -311,6 +321,9 @@ void array_store_check(LIR_Opr value, LIR_Opr array, CodeEmitInfo* store_check_info, ciMethod* profiled_method, int profiled_bci); static LIR_Opr result_register_for(ValueType* type, bool callee = false); +#ifdef AARCH32 + static LIR_Opr java_result_register_for(ValueType* type, bool callee = false); +#endif ciObject* get_jobject_constant(Value value); --- old/src/hotspot/share/c1/c1_LinearScan.cpp 2018-09-25 19:23:37.000000000 +0300 +++ new/src/hotspot/share/c1/c1_LinearScan.cpp 2018-09-25 19:23:37.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -179,10 +180,10 @@ } bool LinearScan::is_virtual_cpu_interval(const Interval* i) { -#if defined(__SOFTFP__) || defined(E500V2) +#if !defined(AARCH32) && (defined(__SOFTFP__) || defined(E500V2)) return i->reg_num() >= LIR_OprDesc::vreg_base; #else - return i->reg_num() >= LIR_OprDesc::vreg_base && (i->type() != T_FLOAT && i->type() != T_DOUBLE); + return i->reg_num() >= LIR_OprDesc::vreg_base && (AARCH32_ONLY(!hasFPU() ||) (i->type() != T_FLOAT && i->type() != T_DOUBLE)); #endif // __SOFTFP__ or E500V2 } @@ -191,10 +192,10 @@ } bool LinearScan::is_virtual_fpu_interval(const Interval* i) { -#if defined(__SOFTFP__) || defined(E500V2) +#if !defined(AARCH32) && (defined(__SOFTFP__) || defined(E500V2)) return false; #else - return i->reg_num() >= LIR_OprDesc::vreg_base && (i->type() == T_FLOAT || i->type() == T_DOUBLE); + return i->reg_num() >= LIR_OprDesc::vreg_base && (i->type() == T_FLOAT || i->type() == T_DOUBLE) AARCH32_ONLY(&& hasFPU()); #endif // __SOFTFP__ or E500V2 } @@ -2100,6 +2101,13 @@ #ifdef __SOFTFP__ case T_FLOAT: // fall through +#if defined(AARCH32) + if(hasFPU()) { + assert(assigned_reg >= pd_first_fpu_reg && assigned_reg <= pd_last_fpu_reg, "no fpu register"); + assert(interval->assigned_regHi() == any_reg, "must not have hi register"); + return LIR_OprFact::single_fpu(assigned_reg - pd_first_fpu_reg); + } +#endif #endif // __SOFTFP__ case T_INT: { assert(assigned_reg >= pd_first_cpu_reg && assigned_reg <= pd_last_cpu_reg, "no cpu register"); @@ -2109,6 +2117,14 @@ #ifdef __SOFTFP__ case T_DOUBLE: // fall through +#if defined(AARCH32) + if(hasFPU()) { + assert(assigned_reg >= pd_first_fpu_reg && assigned_reg <= pd_last_fpu_reg, "no fpu register"); + assert(interval->assigned_regHi() >= pd_first_fpu_reg && interval->assigned_regHi() <= pd_last_fpu_reg, "no fpu register"); + assert(assigned_reg % 2 == 0 && assigned_reg + 1 == interval->assigned_regHi(), "must be sequential and even"); + return LIR_OprFact::double_fpu(assigned_reg - pd_first_fpu_reg, interval->assigned_regHi() - pd_first_fpu_reg); + } +#endif #endif // 
__SOFTFP__ case T_LONG: { int assigned_regHi = interval->assigned_regHi(); @@ -2176,7 +2192,7 @@ assert(interval->assigned_regHi() >= pd_first_fpu_reg && interval->assigned_regHi() <= pd_last_fpu_reg, "no fpu register"); assert(assigned_reg % 2 == 0 && assigned_reg + 1 == interval->assigned_regHi(), "must be sequential and even"); LIR_Opr result = LIR_OprFact::double_fpu(interval->assigned_regHi() - pd_first_fpu_reg, assigned_reg - pd_first_fpu_reg); -#elif defined(ARM32) +#elif defined(ARM32) || defined(AARCH32) assert(assigned_reg >= pd_first_fpu_reg && assigned_reg <= pd_last_fpu_reg, "no fpu register"); assert(interval->assigned_regHi() >= pd_first_fpu_reg && interval->assigned_regHi() <= pd_last_fpu_reg, "no fpu register"); assert(assigned_reg % 2 == 0 && assigned_reg + 1 == interval->assigned_regHi(), "must be sequential and even"); @@ -2774,7 +2790,7 @@ #ifdef SPARC assert(opr->fpu_regnrLo() == opr->fpu_regnrHi() + 1, "assumed in calculation (only fpu_regnrHi is used)"); #endif -#ifdef ARM32 +#if defined(ARM32) || defined(AARCH32) assert(opr->fpu_regnrHi() == opr->fpu_regnrLo() + 1, "assumed in calculation (only fpu_regnrLo is used)"); #endif #ifdef PPC32 --- old/src/hotspot/share/c1/c1_Runtime1.cpp 2018-09-25 19:23:38.000000000 +0300 +++ new/src/hotspot/share/c1/c1_Runtime1.cpp 2018-09-25 19:23:38.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -326,6 +327,16 @@ #endif FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32()); FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32C()); +#ifdef AARCH32 + FUNCTION_CASE(entry, StubRoutines::aescrypt_encryptBlock()); + FUNCTION_CASE(entry, StubRoutines::aescrypt_decryptBlock()); + FUNCTION_CASE(entry, StubRoutines::cipherBlockChaining_encryptAESCrypt_special()); + FUNCTION_CASE(entry, StubRoutines::cipherBlockChaining_decryptAESCrypt_special()); + FUNCTION_CASE(entry, StubRoutines::updateBytesCRC32()); + FUNCTION_CASE(entry, StubRoutines::sha1_implCompress()); + FUNCTION_CASE(entry, StubRoutines::sha256_implCompress()); + FUNCTION_CASE(entry, StubRoutines::sha512_implCompress()); +#endif FUNCTION_CASE(entry, StubRoutines::vectorizedMismatch()); FUNCTION_CASE(entry, StubRoutines::dexp()); FUNCTION_CASE(entry, StubRoutines::dlog()); --- old/src/hotspot/share/classfile/javaClasses.cpp 2018-09-25 19:23:39.000000000 +0300 +++ new/src/hotspot/share/classfile/javaClasses.cpp 2018-09-25 19:23:39.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -4273,6 +4274,14 @@ int java_util_concurrent_locks_AbstractOwnableSynchronizer::_owner_offset; int reflect_ConstantPool::_oop_offset; int reflect_UnsafeStaticFieldAccessorImpl::_base_offset; +#ifdef AARCH32 +int com_sun_crypto_provider_AESCrypt::_K_offset; +int com_sun_crypto_provider_CipherBlockChaining::_r_offset; +int com_sun_crypto_provider_FeedbackCipher::_embeddedCipher_offset; +int sun_security_provider_SHA::_state_offset; +int sun_security_provider_SHA2::_state_offset; +int sun_security_provider_SHA5::_state_offset; +#endif #define STACKTRACEELEMENT_FIELDS_DO(macro) \ macro(declaringClassObject_offset, k, "declaringClassObject", class_signature, false); \ @@ -4435,6 +4444,86 @@ return (hardcoded_offset * heapOopSize) + instanceOopDesc::base_offset_in_bytes(); } +#ifdef AARCH32 +// Support for intrinsification of com.sun.crypto.provider.AESCrypto.encrypt +#define AESCRYPT_FIELDS_DO(macro) \ + macro(_K_offset, k, "K", int_array_signature, false) + +void com_sun_crypto_provider_AESCrypt::compute_offsets() { + InstanceKlass* k = SystemDictionary::AESCrypt_klass(); + AESCRYPT_FIELDS_DO(FIELD_COMPUTE_OFFSET); +} + +int com_sun_crypto_provider_AESCrypt::K_offset() { + return _K_offset; +} + +// Support for intrinsification of com_sun_crypto_provider_CipherBlockChaining.encrypt +#define CBC_FIELDS_DO(macro) \ + macro(_r_offset, k, "r", byte_array_signature, false) + +void com_sun_crypto_provider_CipherBlockChaining::compute_offsets() { + InstanceKlass* k = SystemDictionary::CipherBlockChaining_klass(); + CBC_FIELDS_DO(FIELD_COMPUTE_OFFSET); +} + +int com_sun_crypto_provider_CipherBlockChaining::r_offset() { + return _r_offset; +} + +// Support for intrinsification of com_sun_crypto_provider_CipherBlockChaining.encrypt +#define FC_FIELDS_DO(macro) \ + macro(_embeddedCipher_offset, k, "embeddedCipher", symmetriccipher_signature, false) + +void com_sun_crypto_provider_FeedbackCipher::compute_offsets() { + InstanceKlass* k = SystemDictionary::FeedbackCipher_klass(); + FC_FIELDS_DO(FIELD_COMPUTE_OFFSET); +} + +int com_sun_crypto_provider_FeedbackCipher::embeddedCipher_offset() { + return _embeddedCipher_offset; +} + +// Support for intrinsification of sun_security_provider_SHA.implCompress +#define SHA_FIELDS_DO(macro) \ + macro(_state_offset, k, "state", int_array_signature, false) + +void sun_security_provider_SHA::compute_offsets() { + InstanceKlass* k = SystemDictionary::SHA_klass(); + SHA_FIELDS_DO(FIELD_COMPUTE_OFFSET); +} + +int sun_security_provider_SHA::state_offset() { + return _state_offset; +} + +// Support for intrinsification of sun_security_provider_SHA2.implCompress +#define SHA2_FIELDS_DO(macro) \ + macro(_state_offset, k, "state", int_array_signature, false) + +void sun_security_provider_SHA2::compute_offsets() { + InstanceKlass* k = SystemDictionary::SHA2_klass(); + SHA2_FIELDS_DO(FIELD_COMPUTE_OFFSET); +} + +int sun_security_provider_SHA2::state_offset() { + return _state_offset; +} + +// Support for intrinsification of sun_security_provider_SHA5.implCompress +#define SHA5_FIELDS_DO(macro) \ + macro(_state_offset, k, "state", long_array_signature, false) + +void sun_security_provider_SHA5::compute_offsets() { + InstanceKlass* k = SystemDictionary::SHA5_klass(); + SHA5_FIELDS_DO(FIELD_COMPUTE_OFFSET); +} + +int sun_security_provider_SHA5::state_offset() { + return _state_offset; +} +#endif + // Compute hard-coded offsets // Invoked before SystemDictionary::initialize, so pre-loaded classes // are not 
available to determine the offset_of_static_fields. @@ -4495,6 +4584,15 @@ // generated interpreter code wants to know about the offsets we just computed: AbstractAssembler::update_delayed_values(); + +#ifdef AARCH32 + com_sun_crypto_provider_AESCrypt::compute_offsets(); + com_sun_crypto_provider_FeedbackCipher::compute_offsets(); + com_sun_crypto_provider_CipherBlockChaining::compute_offsets(); + sun_security_provider_SHA::compute_offsets(); + sun_security_provider_SHA2::compute_offsets(); + sun_security_provider_SHA5::compute_offsets(); +#endif } #ifndef PRODUCT --- old/src/hotspot/share/classfile/javaClasses.hpp 2018-09-25 19:23:41.000000000 +0300 +++ new/src/hotspot/share/classfile/javaClasses.hpp 2018-09-25 19:23:40.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -1491,6 +1492,114 @@ static void serialize(SerializeClosure* f) NOT_CDS_RETURN; }; +#ifdef AARCH32 +class com_sun_crypto_provider_AESCrypt: AllStatic { + public: + static int _K_offset; + enum { + hc_ROUND_12_offset = 0, + hc_ROUND_14_offset = 1, + hc_sessionK_offset = 2, + hc_K_offset = 3, + hc_lastKey_offset = 4, + hc_limit = 5 + }; + + static int K_offset(); + static void compute_offsets(); +}; + +class com_sun_crypto_provider_FeedbackCipher: AllStatic { + public: + static int _embeddedCipher_offset; + enum { + hc_blockSize_offset = 0, + hc_embeddedCipher_offset = 1, + hc_iv_offset + }; + + static int embeddedCipher_offset(); + static void compute_offsets(); +}; + +class com_sun_crypto_provider_CipherBlockChaining: + public com_sun_crypto_provider_FeedbackCipher { + public: + static int _r_offset; + enum { + hc_r_offset = hc_iv_offset+1, + hc_k_offset + }; + + static int r_offset(); + static void compute_offsets(); +}; + +class java_security_MessageDigestSpi { +public: + enum { + hc_tempArray = 0 + }; +}; + +class sun_security_provider_DigestBase: java_security_MessageDigestSpi { +public: + enum { + hc_digestLength = hc_tempArray + 1, + hc_bytesProcessed_low, + hc_bytesProcessed_high, + hc_blockSize, + hc_bufOfs, + hc_oneByte, + hc_algorithm, + hc_buffer + }; +}; + +class sun_security_provider_SHA: + public sun_security_provider_DigestBase { + public: + static int _state_offset; + enum { + hc_W = hc_buffer + 1, + hc_state, + hc_init_hashes + }; + + static int state_offset(); + static void compute_offsets(); +}; + + +class sun_security_provider_SHA2: + public sun_security_provider_DigestBase { + public: + static int _state_offset; + enum { + hc_W = hc_buffer + 1, + hc_state, + hc_init_hashes + }; + + static int state_offset(); + static void compute_offsets(); +}; + +class sun_security_provider_SHA5: + public sun_security_provider_DigestBase { + public: + static int _state_offset; + enum { + hc_W = hc_buffer + 1, + hc_state, + hc_init_hashes + }; + + static int state_offset(); + static void compute_offsets(); +}; +#endif + // Use to declare fields that need to be injected into Java classes // for the JVM to use. The name_index and signature_index are // declared in vmSymbols. 
The may_be_java flag is used to declare --- old/src/hotspot/share/classfile/systemDictionary.hpp 2018-09-25 19:23:42.000000000 +0300 +++ new/src/hotspot/share/classfile/systemDictionary.hpp 2018-09-25 19:23:42.000000000 +0300 @@ -212,6 +212,14 @@ do_klass(Integer_klass, java_lang_Integer, Pre ) \ do_klass(Long_klass, java_lang_Long, Pre ) \ \ + /* In the name of the God of speed */ \ + AARCH32_ONLY(do_klass(AESCrypt_klass, com_sun_crypto_provider_aescrypt, Opt )) \ + AARCH32_ONLY(do_klass(CipherBlockChaining_klass, com_sun_crypto_provider_cipherBlockChaining, Opt )) \ + AARCH32_ONLY(do_klass(FeedbackCipher_klass, com_sun_crypto_provider_feedbackcipher, Opt )) \ + AARCH32_ONLY(do_klass(SHA_klass, sun_security_provider_sha, Opt )) \ + AARCH32_ONLY(do_klass(SHA2_klass, sun_security_provider_sha2, Opt )) \ + AARCH32_ONLY(do_klass(SHA5_klass, sun_security_provider_sha5, Opt )) \ + \ /* JVMCI classes. These are loaded on-demand. */ \ JVMCI_WK_KLASSES_DO(do_klass) \ \ --- old/src/hotspot/share/classfile/vmSymbols.hpp 2018-09-25 19:23:43.000000000 +0300 +++ new/src/hotspot/share/classfile/vmSymbols.hpp 2018-09-25 19:23:43.000000000 +0300 @@ -321,6 +321,8 @@ template(DEFAULT_CONTEXT_name, "DEFAULT_CONTEXT") \ NOT_LP64( do_alias(intptr_signature, int_signature) ) \ LP64_ONLY( do_alias(intptr_signature, long_signature) ) \ + /* for the sake of the god of speed */ \ + AARCH32_ONLY(template(com_sun_crypto_provider_feedbackcipher, "com/sun/crypto/provider/FeedbackCipher")) \ \ /* Support for JVMCI */ \ JVMCI_VM_SYMBOLS_DO(template, do_alias) \ @@ -481,6 +483,7 @@ template(byte_array_signature, "[B") \ template(char_array_signature, "[C") \ template(int_array_signature, "[I") \ + AARCH32_ONLY(template(long_array_signature, "[J")) \ template(object_void_signature, "(Ljava/lang/Object;)V") \ template(object_int_signature, "(Ljava/lang/Object;)I") \ template(object_boolean_signature, "(Ljava/lang/Object;)Z") \ @@ -546,6 +549,7 @@ template(int_String_signature, "(I)Ljava/lang/String;") \ template(boolean_boolean_int_signature, "(ZZ)I") \ template(codesource_permissioncollection_signature, "(Ljava/security/CodeSource;Ljava/security/PermissionCollection;)V") \ + AARCH32_ONLY(template(symmetriccipher_signature, "Lcom/sun/crypto/provider/SymmetricCipher;")) \ /* signature symbols needed by intrinsics */ \ VM_INTRINSICS_DO(VM_INTRINSIC_IGNORE, VM_SYMBOL_IGNORE, VM_SYMBOL_IGNORE, template, VM_ALIAS_IGNORE) \ \ --- old/src/hotspot/share/interpreter/abstractInterpreter.cpp 2018-09-25 19:23:44.000000000 +0300 +++ new/src/hotspot/share/interpreter/abstractInterpreter.cpp 2018-09-25 19:23:44.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1997, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -188,6 +189,37 @@ default : break; } +#ifdef AARCH32 + if (UseAESIntrinsics) { + // Use optimized stub code for AES native methods. + switch (m->intrinsic_id()) { + case vmIntrinsics::_aescrypt_encryptBlock : return com_sun_crypto_provider_AESCrypt_encryptBlock; + case vmIntrinsics::_aescrypt_decryptBlock : return com_sun_crypto_provider_AESCrypt_decryptBlock; + } + // Use optimized stub code for AES CBC native methods. 
+ if (StubRoutines::cipherBlockChaining_encryptAESCrypt_special() && + m->intrinsic_id() == vmIntrinsics::_cipherBlockChaining_encryptAESCrypt) + return com_sun_crypto_provider_CipherBlockChaining_encrypt; + + if (StubRoutines::cipherBlockChaining_decryptAESCrypt_special() && + m->intrinsic_id() == vmIntrinsics::_cipherBlockChaining_decryptAESCrypt) + return com_sun_crypto_provider_CipherBlockChaining_decrypt; + } + + // Use optimized stub code for SHA256/512 native methods. + switch (m->intrinsic_id()) { + case vmIntrinsics::_sha_implCompress : + if (UseSHA1Intrinsics) return sun_security_provider_SHA_implCompress; + break; + case vmIntrinsics::_sha2_implCompress : + if (UseSHA256Intrinsics) return sun_security_provider_SHA2_implCompress; + break; + case vmIntrinsics::_sha5_implCompress : + if (UseSHA512Intrinsics) return sun_security_provider_SHA5_implCompress; + break; + } +#endif + // Accessor method? if (m->is_getter()) { // TODO: We should have used ::is_accessor above, but fast accessors in Zero expect only getters. @@ -282,6 +314,15 @@ case java_util_zip_CRC32_updateByteBuffer : tty->print("java_util_zip_CRC32_updateByteBuffer"); break; case java_util_zip_CRC32C_updateBytes : tty->print("java_util_zip_CRC32C_updateBytes"); break; case java_util_zip_CRC32C_updateDirectByteBuffer: tty->print("java_util_zip_CRC32C_updateDirectByteByffer"); break; +#ifdef AARCH32 + case com_sun_crypto_provider_AESCrypt_encryptBlock : tty->print("com_sun_crypto_provider_AESCrypt_encryptBlock"); break; + case com_sun_crypto_provider_AESCrypt_decryptBlock : tty->print("com_sun_crypto_provider_AESCrypt_decryptBlock"); break; + case com_sun_crypto_provider_CipherBlockChaining_encrypt : tty->print("com_sun_crypto_provider_CipherBlockChaining_encrypt"); break; + case com_sun_crypto_provider_CipherBlockChaining_decrypt : tty->print("com_sun_crypto_provider_CipherBlockChaining_decrypt"); break; + case sun_security_provider_SHA_implCompress : tty->print("sun_security_provider_SHA_implCompress"); break; + case sun_security_provider_SHA2_implCompress : tty->print("sun_security_provider_SHA2_implCompress"); break; + case sun_security_provider_SHA5_implCompress : tty->print("sun_security_provider_SHA5_implCompress"); break; +#endif default: if (kind >= method_handle_invoke_FIRST && kind <= method_handle_invoke_LAST) { --- old/src/hotspot/share/interpreter/abstractInterpreter.hpp 2018-09-25 19:23:46.000000000 +0300 +++ new/src/hotspot/share/interpreter/abstractInterpreter.hpp 2018-09-25 19:23:45.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -89,6 +90,15 @@ java_lang_Float_floatToRawIntBits, // implementation of java.lang.Float.floatToRawIntBits() java_lang_Double_longBitsToDouble, // implementation of java.lang.Double.longBitsToDouble() java_lang_Double_doubleToRawLongBits, // implementation of java.lang.Double.doubleToRawLongBits() +#ifdef AARCH32 + com_sun_crypto_provider_AESCrypt_encryptBlock, // implementation of com/sun/crypto/provider/AESCrypt/encryptBlock() + com_sun_crypto_provider_AESCrypt_decryptBlock, // implementation of com/sun/crypto/provider/AESCrypt/decryptBlock() + com_sun_crypto_provider_CipherBlockChaining_encrypt, // implementation of com/sun/crypto/provider/CipherBlockChaining/encrypt() + com_sun_crypto_provider_CipherBlockChaining_decrypt, // implementation of com/sun/crypto/provider/CipherBlockChaining/decrypt() + sun_security_provider_SHA_implCompress, // implementation of sun/security/provider/SHA2/implCompress() + sun_security_provider_SHA2_implCompress, // implementation of sun/security/provider/SHA2/implCompress() + sun_security_provider_SHA5_implCompress, // implementation of sun/security/provider/SHA5/implCompress() +#endif number_of_method_entries, invalid = -1 }; @@ -155,6 +165,13 @@ // the compiled version to the intrinsic version. static bool can_be_compiled(const methodHandle& m) { switch (m->intrinsic_id()) { +#ifdef AARCH32 + case vmIntrinsics::_aescrypt_encryptBlock: + case vmIntrinsics::_aescrypt_decryptBlock: + case vmIntrinsics::_sha_implCompress: + case vmIntrinsics::_sha2_implCompress: + case vmIntrinsics::_sha5_implCompress: +#endif // AARCH32 case vmIntrinsics::_dsin : // fall thru case vmIntrinsics::_dcos : // fall thru case vmIntrinsics::_dtan : // fall thru --- old/src/hotspot/share/interpreter/templateInterpreterGenerator.cpp 2018-09-25 19:23:47.000000000 +0300 +++ new/src/hotspot/share/interpreter/templateInterpreterGenerator.cpp 2018-09-25 19:23:46.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
* * This code is free software; you can redistribute it and/or modify it @@ -227,6 +228,29 @@ method_entry(java_lang_Double_longBitsToDouble); method_entry(java_lang_Double_doubleToRawLongBits); +#ifdef AARCH32 + if (UseAESIntrinsics) { + method_entry(com_sun_crypto_provider_AESCrypt_encryptBlock) + method_entry(com_sun_crypto_provider_AESCrypt_decryptBlock) + if (StubRoutines::cipherBlockChaining_encryptAESCrypt_special()) { + method_entry(com_sun_crypto_provider_CipherBlockChaining_encrypt) + } + if (StubRoutines::cipherBlockChaining_decryptAESCrypt_special()) { + method_entry(com_sun_crypto_provider_CipherBlockChaining_decrypt) + } + } + + if (UseSHA1Intrinsics) { + method_entry(sun_security_provider_SHA_implCompress) + } + if (UseSHA256Intrinsics) { + method_entry(sun_security_provider_SHA2_implCompress) + } + if (UseSHA512Intrinsics) { + method_entry(sun_security_provider_SHA5_implCompress) + } +#endif + #undef method_entry // Bytecodes @@ -460,6 +484,21 @@ native = true; break; #endif // !IA32 +#ifdef AARCH32 + case Interpreter::com_sun_crypto_provider_AESCrypt_encryptBlock: + case Interpreter::com_sun_crypto_provider_AESCrypt_decryptBlock: + entry_point = generate_aescrypt_block_entry(kind); + break; + case Interpreter::com_sun_crypto_provider_CipherBlockChaining_encrypt: + case Interpreter::com_sun_crypto_provider_CipherBlockChaining_decrypt: + // don't use AES CBC intrinsic in interpreter + break; + case Interpreter::sun_security_provider_SHA_implCompress: + case Interpreter::sun_security_provider_SHA2_implCompress: + case Interpreter::sun_security_provider_SHA5_implCompress: + entry_point = generate_SHA_implCompress_entry(kind); + break; +#endif default: fatal("unexpected method kind: %d", kind); break; --- old/src/hotspot/share/interpreter/templateInterpreterGenerator.hpp 2018-09-25 19:23:48.000000000 +0300 +++ new/src/hotspot/share/interpreter/templateInterpreterGenerator.hpp 2018-09-25 19:23:48.000000000 +0300 @@ -1,5 +1,6 @@ /* * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. * * This code is free software; you can redistribute it and/or modify it @@ -100,6 +101,12 @@ address generate_Double_longBitsToDouble_entry(); address generate_Double_doubleToRawLongBits_entry(); #endif // IA32 +#ifdef AARCH32 + address generate_CRC32_updateBytes_inner(AbstractInterpreter::MethodKind kind, int is_crc32c); + address generate_aescrypt_block_entry(AbstractInterpreter::MethodKind kind); + address generate_cipherBlockChaining_encryptAESCrypt_entry(AbstractInterpreter::MethodKind kind); + address generate_SHA_implCompress_entry(AbstractInterpreter::MethodKind kind); +#endif // Some platforms don't need registers, other need two. Unused function is // left unimplemented. 
void generate_stack_overflow_check(void); @@ -114,8 +121,10 @@ void restore_native_result(void); #endif // SPARC -#ifdef AARCH64 +#if defined(AARCH64) void generate_transcendental_entry(AbstractInterpreter::MethodKind kind, int fpargs); +#elif defined(AARCH32) + void generate_transcendental_entry(AbstractInterpreter::MethodKind kind); #endif // AARCH64 #ifdef PPC --- old/src/hotspot/share/jfr/recorder/repository/jfrChunkWriter.cpp 2018-09-25 19:23:49.000000000 +0300 +++ new/src/hotspot/share/jfr/recorder/repository/jfrChunkWriter.cpp 2018-09-25 19:23:49.000000000 +0300 @@ -89,11 +89,11 @@ void JfrChunkWriter::write_header(intptr_t metadata_offset) { assert(this->is_valid(), "invariant"); // Chunk size - this->write_be_at_offset(size_written(), CHUNK_SIZE_OFFSET); + this->write_be_at_offset((jlong)size_written(), CHUNK_SIZE_OFFSET); // initial checkpoint event offset this->write_be_at_offset(_chunkstate->previous_checkpoint_offset(), CHUNK_SIZE_OFFSET + (1 * FILEHEADER_SLOT_SIZE)); // metadata event offset - this->write_be_at_offset(metadata_offset, CHUNK_SIZE_OFFSET + (2 * FILEHEADER_SLOT_SIZE)); + this->write_be_at_offset((jlong)metadata_offset, CHUNK_SIZE_OFFSET + (2 * FILEHEADER_SLOT_SIZE)); // start of chunk in nanos since epoch this->write_be_at_offset(_chunkstate->previous_start_nanos(), CHUNK_SIZE_OFFSET + (3 * FILEHEADER_SLOT_SIZE)); // duration of chunk in nanos --- old/src/hotspot/share/jfr/writers/jfrWriterHost.inline.hpp 2018-09-25 19:23:50.000000000 +0300 +++ new/src/hotspot/share/jfr/writers/jfrWriterHost.inline.hpp 2018-09-25 19:23:50.000000000 +0300 @@ -196,7 +196,7 @@ template inline void WriterHost::write(double value) { - be_write(*(uintptr_t*)&(value)); + be_write(*(u8*)&(value)); } template @@ -277,22 +277,22 @@ template void WriterHost::write(const Ticks& time) { - write((uintptr_t)JfrTime::is_ft_enabled() ? time.ft_value() : time.value()); + write((u8)JfrTime::is_ft_enabled() ? time.ft_value() : time.value()); } template void WriterHost::write(const Tickspan& time) { - write((uintptr_t)JfrTime::is_ft_enabled() ? time.ft_value() : time.value()); + write((u8)JfrTime::is_ft_enabled() ? 
time.ft_value() : time.value()); } template void WriterHost::write(const JfrTicks& time) { - write((uintptr_t)time.value()); + write((u8)time.value()); } template void WriterHost::write(const JfrTickspan& time) { - write((uintptr_t)time.value()); + write((u8)time.value()); } template --- old/src/hotspot/share/opto/c2_globals.hpp 2018-09-25 19:23:52.000000000 +0300 +++ new/src/hotspot/share/opto/c2_globals.hpp 2018-09-25 19:23:51.000000000 +0300 @@ -719,11 +719,11 @@ diagnostic(bool, UseMulAddIntrinsic, false, \ "Enables intrinsification of BigInteger.mulAdd()") \ \ - diagnostic(bool, UseMontgomeryMultiplyIntrinsic, false, \ - "Enables intrinsification of BigInteger.montgomeryMultiply()") \ + NOT_AARCH32(diagnostic(bool, UseMontgomeryMultiplyIntrinsic, false, \ + "Enables intrinsification of BigInteger.montgomeryMultiply()")) \ \ - diagnostic(bool, UseMontgomerySquareIntrinsic, false, \ - "Enables intrinsification of BigInteger.montgomerySquare()") \ + NOT_AARCH32(diagnostic(bool, UseMontgomerySquareIntrinsic, false, \ + "Enables intrinsification of BigInteger.montgomerySquare()")) \ \ product(bool, UseTypeSpeculation, true, \ "Speculatively propagate types from profiles") \ --- old/src/hotspot/share/opto/c2compiler.cpp 2018-09-25 19:23:53.000000000 +0300 +++ new/src/hotspot/share/opto/c2compiler.cpp 2018-09-25 19:23:53.000000000 +0300 @@ -578,7 +578,9 @@ case vmIntrinsics::_sha_implCompress: case vmIntrinsics::_sha2_implCompress: case vmIntrinsics::_sha5_implCompress: +#ifndef AARCH32 case vmIntrinsics::_digestBase_implCompressMB: +#endif case vmIntrinsics::_multiplyToLen: case vmIntrinsics::_squareToLen: case vmIntrinsics::_mulAdd: --- old/src/hotspot/share/opto/library_call.cpp 2018-09-25 19:23:54.000000000 +0300 +++ new/src/hotspot/share/opto/library_call.cpp 2018-09-25 19:23:54.000000000 +0300 @@ -5673,7 +5673,11 @@ bool LibraryCallKit::inline_aescrypt_Block(vmIntrinsics::ID id) { address stubAddr = NULL; const char *stubName; +#ifdef AARCH32 + assert(UseAESIntrinsics, "sanity"); +#else assert(UseAES, "need AES instruction support"); +#endif switch(id) { case vmIntrinsics::_aescrypt_encryptBlock: @@ -5742,7 +5746,11 @@ address stubAddr = NULL; const char *stubName = NULL; +#ifdef AARCH32 + assert(UseAESIntrinsics, "sanity"); +#else assert(UseAES, "need AES instruction support"); +#endif switch(id) { case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: @@ -5811,6 +5819,9 @@ Node* objRvec = load_field_from_object(cipherBlockChaining_object, "r", "[B", /*is_exact*/ false); if (objRvec == NULL) return false; Node* r_start = array_element_address(objRvec, intcon(0), T_BYTE); +#ifdef AARCH32 + if (r_start == NULL) return false; +#endif Node* cbcCrypt; if (Matcher::pass_original_key_for_aes()) { --- old/src/hotspot/share/runtime/globals.hpp 2018-09-25 19:23:56.000000000 +0300 +++ new/src/hotspot/share/runtime/globals.hpp 2018-09-25 19:23:55.000000000 +0300 @@ -253,8 +253,9 @@ /* UseMembar is theoretically a temp flag used for memory barrier */ \ /* removal testing. 
It was supposed to be removed before FCS but has */ \ /* been re-added (see 6401008) */ \ + NOT_AARCH32( \ product_pd(bool, UseMembar, \ - "(Unstable) Issues membars on thread state transitions") \ + "(Unstable) Issues membars on thread state transitions")) \ \ develop(bool, CleanChunkPoolAsync, true, \ "Clean the chunk pool asynchronously") \ @@ -444,6 +445,12 @@ diagnostic(bool, UseAESCTRIntrinsics, false, \ "Use intrinsics for the paralleled version of AES/CTR crypto") \ \ + AARCH32_ONLY(diagnostic(bool, UseMontgomeryMultiplyIntrinsic, false, \ + "Enables intrinsification of BigInteger.montgomeryMultiply()")) \ + \ + AARCH32_ONLY(diagnostic(bool, UseMontgomerySquareIntrinsic, false, \ + "Enables intrinsification of BigInteger.montgomerySquare()")) \ + \ diagnostic(bool, UseSHA1Intrinsics, false, \ "Use intrinsics for SHA-1 crypto hash function. " \ "Requires that UseSHA is enabled.") \ --- old/src/hotspot/share/runtime/synchronizer.cpp 2018-09-25 19:23:57.000000000 +0300 +++ new/src/hotspot/share/runtime/synchronizer.cpp 2018-09-25 19:23:57.000000000 +0300 @@ -220,11 +220,6 @@ // Case: light contention possibly amenable to TLE // Case: TLE inimical operations such as nested/recursive synchronization - if (owner == Self) { - m->_recursions++; - return true; - } - // This Java Monitor is inflated so obj's header will never be // displaced to this thread's BasicLock. Make the displaced header // non-NULL so this BasicLock is not seen as recursive nor as @@ -237,6 +232,11 @@ // and last are the inflated Java Monitor (ObjectMonitor) checks. lock->set_displaced_header(markOopDesc::unused_mark()); + if (owner == Self) { + m->_recursions++; + return true; + } + if (owner == NULL && Atomic::replace_if_null(Self, &(m->_owner))) { assert(m->_recursions == 0, "invariant"); assert(m->_owner == Self, "invariant"); --- old/src/hotspot/share/runtime/vmStructs.cpp 2018-09-25 19:23:59.000000000 +0300 +++ new/src/hotspot/share/runtime/vmStructs.cpp 2018-09-25 19:23:59.000000000 +0300 @@ -607,8 +607,16 @@ static_field(StubRoutines, _call_stub_return_address, address) \ static_field(StubRoutines, _aescrypt_encryptBlock, address) \ static_field(StubRoutines, _aescrypt_decryptBlock, address) \ +AARCH32_ONLY(static_field(StubRoutines, _aes_table_te_addr, address)) \ +AARCH32_ONLY(static_field(StubRoutines, _aes_table_td_addr, address)) \ static_field(StubRoutines, _cipherBlockChaining_encryptAESCrypt, address) \ static_field(StubRoutines, _cipherBlockChaining_decryptAESCrypt, address) \ +AARCH32_ONLY(static_field(StubRoutines, _sha1_implCompress, address)) \ +AARCH32_ONLY(static_field(StubRoutines, _sha1_table_addr, address)) \ +AARCH32_ONLY(static_field(StubRoutines, _sha256_implCompress, address)) \ +AARCH32_ONLY(static_field(StubRoutines, _sha256_table_addr, address)) \ +AARCH32_ONLY(static_field(StubRoutines, _sha512_implCompress, address)) \ +AARCH32_ONLY(static_field(StubRoutines, _sha512_table_addr, address)) \ static_field(StubRoutines, _counterMode_AESCrypt, address) \ static_field(StubRoutines, _ghash_processBlocks, address) \ static_field(StubRoutines, _base64_encodeBlock, address) \ --- old/src/hotspot/share/utilities/macros.hpp 2018-09-25 19:24:00.000000000 +0300 +++ new/src/hotspot/share/utilities/macros.hpp 2018-09-25 19:24:00.000000000 +0300 @@ -541,10 +541,11 @@ #define NOT_E500V2(code) code #endif -// Note: There are three ARM ports. They set the following in the makefiles: +// Note: There are four ARM ports. They set the following in the makefiles: // 1. 
Closed 32-bit port: -DARM -DARM32 -DTARGET_ARCH_arm // 2. Closed 64-bit port: -DARM -DAARCH64 -D_LP64 -DTARGET_ARCH_arm // 3. Open 64-bit port: -DAARCH64 -D_LP64 -DTARGET_ARCH_aaarch64 +// 4. Open 32-bit port: -DAARCH32 -DTARGET_ARCH_aarch32 #ifdef ARM #define ARM_ONLY(code) code #define NOT_ARM(code) @@ -577,6 +578,14 @@ #define BIG_ENDIAN_ONLY(code) code #endif +#ifdef AARCH32 +#define AARCH32_ONLY(code) code +#define NOT_AARCH32(code) +#else +#define AARCH32_ONLY(code) +#define NOT_AARCH32(code) code +#endif + #define define_pd_global(type, name, value) const type pd_##name = value; // Helper macros for constructing file names for includes. --- old/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/HotSpotAgent.java 2018-09-25 19:24:01.000000000 +0300 +++ new/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/HotSpotAgent.java 2018-09-25 19:24:01.000000000 +0300 @@ -35,6 +35,7 @@ import sun.jvm.hotspot.debugger.MachineDescriptionAMD64; import sun.jvm.hotspot.debugger.MachineDescriptionPPC64; import sun.jvm.hotspot.debugger.MachineDescriptionAArch64; +import sun.jvm.hotspot.debugger.MachineDescriptionArm; import sun.jvm.hotspot.debugger.MachineDescriptionIntelX86; import sun.jvm.hotspot.debugger.MachineDescriptionSPARC32Bit; import sun.jvm.hotspot.debugger.MachineDescriptionSPARC64Bit; @@ -589,6 +590,8 @@ machDesc = new MachineDescriptionPPC64(); } else if (cpu.equals("aarch64")) { machDesc = new MachineDescriptionAArch64(); + } else if (cpu.equals("arm")) { + machDesc = new MachineDescriptionArm(); } else if (cpu.equals("sparc")) { if (LinuxDebuggerLocal.getAddressSize()==8) { machDesc = new MachineDescriptionSPARC64Bit(); --- old/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java 2018-09-25 19:24:02.000000000 +0300 +++ new/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/utilities/PlatformInfo.java 2018-09-25 19:24:02.000000000 +0300 @@ -54,7 +54,7 @@ public static boolean knownCPU(String cpu) { final String[] KNOWN = - new String[] {"i386", "x86", "x86_64", "amd64", "sparc", "sparcv9", "ppc64", "ppc64le", "aarch64"}; + new String[] {"i386", "x86", "x86_64", "amd64", "sparc", "sparcv9", "ppc64", "ppc64le", "aarch64", "arm"}; for(String s : KNOWN) { if(s.equals(cpu)) --- old/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java 2018-09-25 19:24:04.000000000 +0300 +++ new/test/hotspot/jtreg/compiler/testlibrary/sha/predicate/IntrinsicPredicates.java 2018-09-25 19:24:03.000000000 +0300 @@ -94,6 +94,9 @@ new OrPredicate(new CPUSpecificPredicate("amd64.*", new String[] { "avx2", "bmi2" }, null), new CPUSpecificPredicate("x86_64", new String[] { "avx2", "bmi2" }, null)))))))))); + public static final BooleanSupplier AARCH32_NEON_AVAILABLE + = new CPUSpecificPredicate("arm.*", new String[] { "neon" }, null); + public static final BooleanSupplier ANY_SHA_INSTRUCTION_AVAILABLE = new OrPredicate(IntrinsicPredicates.SHA1_INSTRUCTION_AVAILABLE, new OrPredicate( --- old/test/hotspot/jtreg/runtime/ReservedStack/ReservedStackTest.java 2018-09-25 19:24:05.000000000 +0300 +++ new/test/hotspot/jtreg/runtime/ReservedStack/ReservedStackTest.java 2018-09-25 19:24:05.000000000 +0300 @@ -239,7 +239,7 @@ return Platform.isAix() || (Platform.isLinux() && (Platform.isPPC() || Platform.isS390x() || Platform.isX64() || - Platform.isX86())) || + Platform.isX86()) || Platform.isARM()) || Platform.isOSX() || Platform.isSolaris(); } --- /dev/null 2018-09-25 19:24:06.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/aarch32.ad 2018-09-25 19:24:06.000000000 +0300 
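For orientation before the new aarch32.ad sources below: the AARCH32_ONLY/NOT_AARCH32 pair added to macros.hpp above is what most of the other hunks lean on (os_linux.cpp, c2_globals.hpp, systemDictionary.hpp, vmStructs.cpp). A minimal standalone sketch of the pattern follows; the main() and the 2M/4M constants in it are illustrative only, not HotSpot code.

#include <cstddef>
#include <cstdio>

// Same shape as the macros.hpp hunk: exactly one macro of the pair expands
// to its argument, the other expands to nothing.
#ifdef AARCH32
#define AARCH32_ONLY(code) code
#define NOT_AARCH32(code)
#else
#define AARCH32_ONLY(code)
#define NOT_AARCH32(code) code
#endif

int main() {
  const size_t M = 1024 * 1024;
  // Mirrors the os_linux.cpp hunk: a per-CPU value is assembled by
  // concatenating *_ONLY expansions, and only the matching one survives.
  const size_t default_large_page_size =
      AARCH32_ONLY(2 * M)
      NOT_AARCH32(4 * M);
  printf("default large page size: %zu\n", default_large_page_size);
  return 0;
}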
@@ -0,0 +1,11817 @@ +// +// Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). +// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. +// + +// AARCH32 Architecture Description File + +//----------REGISTER DEFINITION BLOCK------------------------------------------ +// This information is used by the matcher and the register allocator to +// describe individual registers and classes of registers within the target +// archtecture. +register %{ +//----------Architecture Description Register Definitions---------------------- +// General Registers +// "reg_def" name ( register save type, C convention save type, +// ideal register type, encoding, vm name ); +// Register Save Types: +// +// NS = No-Save: The register allocator assumes that these registers +// can be used without saving upon entry to the method, & +// that they do not need to be saved at call sites. +// +// SOC = Save-On-Call: The register allocator assumes that these registers +// can be used without saving upon entry to the method, +// but that they must be saved at call sites. +// +// SOE = Save-On-Entry: The register allocator assumes that these registers +// must be saved before using them upon entry to the +// method, but they do not need to be saved at call +// sites. +// +// AS = Always-Save: The register allocator assumes that these registers +// must be saved before using them upon entry to the +// method, & that they must be saved at call sites. +// +// Ideal Register Type is used to determine how to save & restore a +// register. Op_RegI will get spilled with LoadI/StoreI, Op_RegP will get +// spilled with LoadP/StoreP. If the register supports both, use Op_RegI. +// +// The encoding number is the actual bit-pattern placed into the opcodes. 
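To make the "encoding number is the actual bit-pattern placed into the opcodes" remark concrete, the sketch below (plain C++, not part of the port) packs the same r0..r15 encoding numbers listed in this file into an A32 data-processing instruction word.

#include <cstdint>
#include <cstdio>

// Standalone illustration of how a register's encoding number becomes a
// bit-field of the final opcode; not code from the port.
static uint32_t encode_add_reg(unsigned rd, unsigned rn, unsigned rm) {
  const uint32_t cond_al = 0xEu << 28;  // condition field: always
  const uint32_t opc_add = 0x4u << 21;  // data-processing opcode: ADD
  return cond_al | opc_add | (rn << 16) | (rd << 12) | rm;
}

int main() {
  // "add r0, r1, r2" assembles to 0xE0810002
  printf("add r0, r1, r2 -> 0x%08X\n", (unsigned)encode_add_reg(0, 1, 2));
  return 0;
}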
+ + +// ---------------------------- +// Integer/Long Registers +// ---------------------------- + +reg_def R_R0 (SOC, SOC, Op_RegI, 0, R(0)->as_VMReg()); +reg_def R_R1 (SOC, SOC, Op_RegI, 1, R(1)->as_VMReg()); +reg_def R_R2 (SOC, SOC, Op_RegI, 2, R(2)->as_VMReg()); +reg_def R_R3 (SOC, SOC, Op_RegI, 3, R(3)->as_VMReg()); +reg_def R_R4 (SOC, SOE, Op_RegI, 4, R(4)->as_VMReg()); +reg_def R_R5 (SOC, SOE, Op_RegI, 5, R(5)->as_VMReg()); +reg_def R_R6 (SOC, SOE, Op_RegI, 6, R(6)->as_VMReg()); +reg_def R_R7 (SOC, SOE, Op_RegI, 7, R(7)->as_VMReg()); +reg_def R_R8 (SOC, SOE, Op_RegI, 8, R(8)->as_VMReg()); +reg_def R_R9 (SOC, SOE, Op_RegI, 9, R(9)->as_VMReg()); +reg_def R_R10(NS, SOE, Op_RegI, 10, R(10)->as_VMReg()); +reg_def R_R11(NS, SOE, Op_RegI, 11, R(11)->as_VMReg()); +reg_def R_R12(SOC, SOC, Op_RegI, 12, R(12)->as_VMReg()); +reg_def R_R13(NS, NS, Op_RegI, 13, R(13)->as_VMReg()); +reg_def R_R14(SOC, SOC, Op_RegI, 14, R(14)->as_VMReg()); +reg_def R_R15(NS, NS, Op_RegI, 15, R(15)->as_VMReg()); + +// ---------------------------- +// Float/Double Registers +// ---------------------------- + +// Float Registers + +reg_def R_S0 ( SOC, SOC, Op_RegF, 0, f0->as_VMReg()); +reg_def R_S1 ( SOC, SOC, Op_RegF, 1, f1->as_VMReg()); +reg_def R_S2 ( SOC, SOC, Op_RegF, 2, f2->as_VMReg()); +reg_def R_S3 ( SOC, SOC, Op_RegF, 3, f3->as_VMReg()); +reg_def R_S4 ( SOC, SOC, Op_RegF, 4, f4->as_VMReg()); +reg_def R_S5 ( SOC, SOC, Op_RegF, 5, f5->as_VMReg()); +reg_def R_S6 ( SOC, SOC, Op_RegF, 6, f6->as_VMReg()); +reg_def R_S7 ( SOC, SOC, Op_RegF, 7, f7->as_VMReg()); +reg_def R_S8 ( SOC, SOC, Op_RegF, 8, f8->as_VMReg()); +reg_def R_S9 ( SOC, SOC, Op_RegF, 9, f9->as_VMReg()); +reg_def R_S10( SOC, SOC, Op_RegF, 10,f10->as_VMReg()); +reg_def R_S11( SOC, SOC, Op_RegF, 11,f11->as_VMReg()); +reg_def R_S12( SOC, SOC, Op_RegF, 12,f12->as_VMReg()); +reg_def R_S13( SOC, SOC, Op_RegF, 13,f13->as_VMReg()); +reg_def R_S14( SOC, SOC, Op_RegF, 14,f14->as_VMReg()); +reg_def R_S15( SOC, SOC, Op_RegF, 15,f15->as_VMReg()); +reg_def R_S16( SOC, SOE, Op_RegF, 16,f16->as_VMReg()); +reg_def R_S17( SOC, SOE, Op_RegF, 17,f17->as_VMReg()); +reg_def R_S18( SOC, SOE, Op_RegF, 18,f18->as_VMReg()); +reg_def R_S19( SOC, SOE, Op_RegF, 19,f19->as_VMReg()); +reg_def R_S20( SOC, SOE, Op_RegF, 20,f20->as_VMReg()); +reg_def R_S21( SOC, SOE, Op_RegF, 21,f21->as_VMReg()); +reg_def R_S22( SOC, SOE, Op_RegF, 22,f22->as_VMReg()); +reg_def R_S23( SOC, SOE, Op_RegF, 23,f23->as_VMReg()); +reg_def R_S24( SOC, SOE, Op_RegF, 24,f24->as_VMReg()); +reg_def R_S25( SOC, SOE, Op_RegF, 25,f25->as_VMReg()); +reg_def R_S26( SOC, SOE, Op_RegF, 26,f26->as_VMReg()); +reg_def R_S27( SOC, SOE, Op_RegF, 27,f27->as_VMReg()); +reg_def R_S28( SOC, SOE, Op_RegF, 28,f28->as_VMReg()); +reg_def R_S29( SOC, SOE, Op_RegF, 29,f29->as_VMReg()); +reg_def R_S30( SOC, SOE, Op_RegF, 30,f30->as_VMReg()); +reg_def R_S31( SOC, SOE, Op_RegF, 31,f31->as_VMReg()); + +// Double Registers +// The rules of ADL require that double registers be defined in pairs. +// Each pair must be two 32-bit values, but not necessarily a pair of +// single float registers. In each pair, ADLC-assigned register numbers +// must be adjacent, with the lower number even. Finally, when the +// CPU stores such a register pair to memory, the word associated with +// the lower ADLC-assigned number must be stored to the lower address. 
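The pairing rule described above is the same invariant the c1_LinearScan.cpp hunks earlier in this patch assert for AARCH32 (assigned_reg % 2 == 0 && assigned_reg + 1 == interval->assigned_regHi()). A trivial standalone restatement, for illustration only:

#include <cassert>

// A double occupies two adjacent single-precision registers, low number even;
// illustrative helper, not port code.
static bool is_valid_double_pair(int lo, int hi) {
  return (lo % 2 == 0) && (hi == lo + 1);
}

int main() {
  assert(is_valid_double_pair(0, 1));   // s0/s1 may carry a double
  assert(!is_valid_double_pair(1, 2));  // odd low half is rejected
  assert(!is_valid_double_pair(2, 4));  // halves must be adjacent
  return 0;
}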
+ +// TODO, the problem is that AArch32 port has same same numeric value for +// d16->as_VMReg and f1->as_VMReg which breaks reverse mapping from +// VMReg to OptoReg +// reg_def R_D16 (SOC, SOC, Op_RegD, 32, d16->as_VMReg()); +// reg_def R_D16x(SOC, SOC, Op_RegD,255, d16->as_VMReg()->next()); +// reg_def R_D17 (SOC, SOC, Op_RegD, 34, d17->as_VMReg()); +// reg_def R_D17x(SOC, SOC, Op_RegD,255, d17->as_VMReg()->next()); +// reg_def R_D18 (SOC, SOC, Op_RegD, 36, d18->as_VMReg()); +// reg_def R_D18x(SOC, SOC, Op_RegD,255, d18->as_VMReg()->next()); +// reg_def R_D19 (SOC, SOC, Op_RegD, 38, d19->as_VMReg()); +// reg_def R_D19x(SOC, SOC, Op_RegD,255, d19->as_VMReg()->next()); +// reg_def R_D20 (SOC, SOC, Op_RegD, 40, d20->as_VMReg()); +// reg_def R_D20x(SOC, SOC, Op_RegD,255, d20->as_VMReg()->next()); +// reg_def R_D21 (SOC, SOC, Op_RegD, 42, d21->as_VMReg()); +// reg_def R_D21x(SOC, SOC, Op_RegD,255, d21->as_VMReg()->next()); +// reg_def R_D22 (SOC, SOC, Op_RegD, 44, d22->as_VMReg()); +// reg_def R_D22x(SOC, SOC, Op_RegD,255, d22->as_VMReg()->next()); +// reg_def R_D23 (SOC, SOC, Op_RegD, 46, d23->as_VMReg()); +// reg_def R_D23x(SOC, SOC, Op_RegD,255, d23->as_VMReg()->next()); +// reg_def R_D24 (SOC, SOC, Op_RegD, 48, d24->as_VMReg()); +// reg_def R_D24x(SOC, SOC, Op_RegD,255, d24->as_VMReg()->next()); +// reg_def R_D25 (SOC, SOC, Op_RegD, 50, d25->as_VMReg()); +// reg_def R_D25x(SOC, SOC, Op_RegD,255, d25->as_VMReg()->next()); +// reg_def R_D26 (SOC, SOC, Op_RegD, 52, d26->as_VMReg()); +// reg_def R_D26x(SOC, SOC, Op_RegD,255, d26->as_VMReg()->next()); +// reg_def R_D27 (SOC, SOC, Op_RegD, 54, d27->as_VMReg()); +// reg_def R_D27x(SOC, SOC, Op_RegD,255, d27->as_VMReg()->next()); +// reg_def R_D28 (SOC, SOC, Op_RegD, 56, d28->as_VMReg()); +// reg_def R_D28x(SOC, SOC, Op_RegD,255, d28->as_VMReg()->next()); +// reg_def R_D29 (SOC, SOC, Op_RegD, 58, d29->as_VMReg()); +// reg_def R_D29x(SOC, SOC, Op_RegD,255, d29->as_VMReg()->next()); +// reg_def R_D30 (SOC, SOC, Op_RegD, 60, d30->as_VMReg()); +// reg_def R_D30x(SOC, SOC, Op_RegD,255, d30->as_VMReg()->next()); +// reg_def R_D31 (SOC, SOC, Op_RegD, 62, d31->as_VMReg()); +// reg_def R_D31x(SOC, SOC, Op_RegD,255, d31->as_VMReg()->next()); + +// ---------------------------- +// Special Registers +// Condition Codes Flag Registers +reg_def APSR (SOC, SOC, Op_RegFlags, 0, VMRegImpl::Bad()); +reg_def FPSCR(SOC, SOC, Op_RegFlags, 0, VMRegImpl::Bad()); + +// ---------------------------- +// Specify the enum values for the registers. These enums are only used by the +// OptoReg "class". We can convert these enum values at will to VMReg when needed +// for visibility to the rest of the vm. The order of this enum influences the +// register allocator so having the freedom to set this order and not be stuck +// with the order that is natural for the rest of the vm is worth it. + +// registers in that order so that R11/R12 is an aligned pair that can be used for longs +alloc_class chunk0( + R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R10, R_R13, R_R14, R_R15, R_R0, R_R1, R_R2, R_R3); + +// Note that a register is not allocatable unless it is also mentioned +// in a widely-used reg_class below. 
+ +alloc_class chunk1( + R_S16, R_S17, R_S18, R_S19, R_S20, R_S21, R_S22, R_S23, + R_S24, R_S25, R_S26, R_S27, R_S28, R_S29, R_S30, R_S31, + R_S0, R_S1, R_S2, R_S3, R_S4, R_S5, R_S6, R_S7, + R_S8, R_S9, R_S10, R_S11, R_S12, R_S13, R_S14, R_S15 + // , + // R_D16, R_D16x,R_D17, R_D17x,R_D18, R_D18x,R_D19, R_D19x, + // R_D20, R_D20x,R_D21, R_D21x,R_D22, R_D22x,R_D23, R_D23x, + // R_D24, R_D24x,R_D25, R_D25x,R_D26, R_D26x,R_D27, R_D27x, + // R_D28, R_D28x,R_D29, R_D29x,R_D30, R_D30x,R_D31, R_D31x +); + +alloc_class chunk2(APSR, FPSCR); + +//----------Architecture Description Register Classes-------------------------- +// Several register classes are automatically defined based upon information in +// this architecture description. +// 1) reg_class inline_cache_reg ( as defined in frame section ) +// 2) reg_class interpreter_method_oop_reg ( as defined in frame section ) +// 3) reg_class stack_slots( /* one chunk of stack-based "registers" */ ) +// + +// ---------------------------- +// Integer Register Classes +// ---------------------------- +// Exclusions from i_reg: +// sp (R13), PC (R15) +// R10: reserved by HotSpot to the TLS register (invariant within Java) +reg_class int_reg(R_R0, R_R1, R_R2, R_R3, R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R14); + +reg_class R0_regI(R_R0); +reg_class R1_regI(R_R1); +reg_class R2_regI(R_R2); +reg_class R3_regI(R_R3); +reg_class R9_regI(R_R9); +reg_class R12_regI(R_R12); + +// ---------------------------- +// Pointer Register Classes +// ---------------------------- +reg_class ptr_reg(R_R0, R_R1, R_R2, R_R3, R_R4, R_R5, R_R6, R_R7, R_R8, R_R9, R_R11, R_R12, R_R14); +// Special class for storeP instructions, which can store SP or RPC to TLS. +// It is also used for memory addressing, allowing direct TLS addressing. +reg_class sp_ptr_reg(R_R0, R_R1, R_R2, R_R3, R_R4, R_R5, R_R6, R_R7, R_R9, R_R11, R_R12, R_R14, R_R8, R_R10 /* TLS*/, R_R13 /* SP*/); + +#define R_Ricklass R_R12 +#define R_Rmethod R_R8 +#define R_Rthread R_R10 +#define R_Rexception_obj R_R0 + +// Other special pointer regs +reg_class R0_regP(R_R0); +reg_class R1_regP(R_R1); +reg_class R2_regP(R_R2); +reg_class R4_regP(R_R4); +reg_class Rexception_regP(R_Rexception_obj); +reg_class Ricklass_regP(R_Ricklass); +reg_class Rmethod_regP(R_Rmethod); +reg_class Rthread_regP(R_Rthread); +reg_class IP_regP(R_R12); +reg_class LR_regP(R_R14); + +reg_class FP_regP(R_R11); + +// ---------------------------- +// Long Register Classes +// ---------------------------- +reg_class long_reg ( R_R0,R_R1, R_R2,R_R3, R_R4,R_R5, R_R6,R_R7, R_R8,R_R9, R_R11,R_R12); +// for ldrexd, strexd: first reg of pair must be even +reg_class long_reg_align ( R_R0,R_R1, R_R2,R_R3, R_R4,R_R5, R_R6,R_R7, R_R8,R_R9); + +reg_class R0R1_regL(R_R0,R_R1); +reg_class R2R3_regL(R_R2,R_R3); + +// ---------------------------- +// Special Class for Condition Code Flags Register +reg_class int_flags(APSR); +reg_class float_flags(FPSCR); + + +// ---------------------------- +// Float Point Register Classes +// ---------------------------- +// Skip f14/f15, they are reserved for mem-mem copies +reg_class sflt_reg(R_S0, R_S1, R_S2, R_S3, R_S4, R_S5, R_S6, R_S7, R_S8, R_S9, R_S10, R_S11, R_S12, R_S13, + R_S16, R_S17, R_S18, R_S19, R_S20, R_S21, R_S22, R_S23, R_S24, R_S25, R_S26, R_S27, R_S28, R_S29, R_S30, R_S31); + +// Paired floating point registers--they show up in the same order as the floats, +// but they are used with the "Op_RegD" type, and always occur in even/odd pairs. 
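+// (For example, R_S0/R_S1 together back the Op_RegD view of d0; the upper bank
+// d16..d31 would be appended here once the VMReg aliasing TODO above is
+// resolved, as the commented-out R_D16.. entries below suggest.)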
+reg_class dflt_reg(R_S0,R_S1, R_S2,R_S3, R_S4,R_S5, R_S6,R_S7, R_S8,R_S9, R_S10,R_S11, R_S12,R_S13, + R_S16,R_S17, R_S18,R_S19, R_S20,R_S21, R_S22,R_S23, R_S24,R_S25, R_S26,R_S27, R_S28,R_S29, R_S30,R_S31 + // , + // R_D16,R_D16x, R_D17,R_D17x, R_D18,R_D18x, R_D19,R_D19x, R_D20,R_D20x, R_D21,R_D21x, R_D22,R_D22x, + // R_D23,R_D23x, R_D24,R_D24x, R_D25,R_D25x, R_D26,R_D26x, R_D27,R_D27x, R_D28,R_D28x, R_D29,R_D29x, + // R_D30,R_D30x, R_D31,R_D31x + ); + +reg_class dflt_low_reg(R_S0,R_S1, R_S2,R_S3, R_S4,R_S5, R_S6,R_S7, R_S8,R_S9, R_S10,R_S11, R_S12,R_S13, + R_S16,R_S17, R_S18,R_S19, R_S20,R_S21, R_S22,R_S23, R_S24,R_S25, R_S26,R_S27, R_S28,R_S29, R_S30,R_S31); + + +reg_class actual_dflt_reg %{ + if (/*VM_Version::features() & FT_VFPV3D32*/0) { // TODO verify and enable + return DFLT_REG_mask(); + } else { + return DFLT_LOW_REG_mask(); + } +%} + +reg_class f0_regF(R_S0); +reg_class D0_regD(R_S0,R_S1); +reg_class D1_regD(R_S2,R_S3); +reg_class D2_regD(R_S4,R_S5); +reg_class D3_regD(R_S6,R_S7); +reg_class D4_regD(R_S8,R_S9); +reg_class D5_regD(R_S10,R_S11); +reg_class D6_regD(R_S12,R_S13); +reg_class D7_regD(R_S14,R_S15); +reg_class D0D1_regD(R_S0,R_S1,R_S2,R_S3); +reg_class D2D3_regD(R_S4,R_S5,R_S6,R_S7); + +// reg_class D16_regD(R_D16,R_D16x); +// reg_class D17_regD(R_D17,R_D17x); +// reg_class D18_regD(R_D18,R_D18x); +// reg_class D19_regD(R_D19,R_D19x); +// reg_class D20_regD(R_D20,R_D20x); +// reg_class D21_regD(R_D21,R_D21x); +// reg_class D22_regD(R_D22,R_D22x); +// reg_class D23_regD(R_D23,R_D23x); +// reg_class D24_regD(R_D24,R_D24x); +// reg_class D25_regD(R_D25,R_D25x); +// reg_class D26_regD(R_D26,R_D26x); +// reg_class D27_regD(R_D27,R_D27x); +// reg_class D28_regD(R_D28,R_D28x); +// reg_class D29_regD(R_D29,R_D29x); +// reg_class D30_regD(R_D30,R_D30x); +// reg_class D31_regD(R_D31,R_D31x); + +reg_class vectorx_reg(R_S0,R_S1,R_S2,R_S3, R_S4,R_S5,R_S6,R_S7, + R_S8,R_S9,R_S10,R_S11, /* skip f14/f15 */ + R_S16,R_S17,R_S18,R_S19, R_S20,R_S21,R_S22,R_S23, + R_S24,R_S25,R_S26,R_S27, R_S28,R_S29,R_S30,R_S31 + // , + // R_D16,R_D16x,R_D17,R_D17x, R_D18,R_D18x,R_D19,R_D19x, + // R_D20,R_D20x,R_D21,R_D21x, R_D22,R_D22x,R_D23,R_D23x, + // R_D24,R_D24x,R_D25,R_D25x, R_D26,R_D26x,R_D27,R_D27x, + // R_D28,R_D28x,R_D29,R_D29x, R_D30,R_D30x,R_D31,R_D31x + ); + +%} + +source_hpp %{ +// FIXME +const MachRegisterNumbers R_mem_copy_lo_num = R_S14_num; +const MachRegisterNumbers R_mem_copy_hi_num = R_S15_num; +const FloatRegister Rmemcopy = f14; +const MachRegisterNumbers R_hf_ret_lo_num = R_S0_num; +const MachRegisterNumbers R_hf_ret_hi_num = R_S1_num; + +const MachRegisterNumbers R_Ricklass_num = R_R12_num; +const MachRegisterNumbers R_Rmethod_num = R_R8_num; + +#define LDR_DOUBLE "FLDD" +#define LDR_FLOAT "FLDS" +#define STR_DOUBLE "FSTD" +#define STR_FLOAT "FSTS" +#define LDR_64 "LDRD" +#define STR_64 "STRD" +#define LDR_32 "LDR" +#define STR_32 "STR" +#define MOV_DOUBLE "FCPYD" +#define MOV_FLOAT "FCPYS" +#define FMSR "FMSR" +#define FMRS "FMRS" +#define LDREX "ldrex " +#define STREX "strex " + +static inline bool is_memoryD(int offset) { + return offset < 1024 && offset > -1024; +} + +static inline bool is_memoryfp(int offset) { + return offset < 1024 && offset > -1024; +} + +static inline bool is_memoryI(int offset) { + return offset < 4096 && offset > -4096; +} + +static inline bool is_memoryP(int offset) { + return offset < 4096 && offset > -4096; +} + +static inline bool is_memoryHD(int offset) { + return offset < 256 && offset > -256; +} + +static inline bool is_aimm(int imm) { + return 
Assembler::is_valid_for_imm12(imm); +} + +static inline bool is_limmI(jint imm) { + return Assembler::is_valid_for_imm12(imm); +} + +static inline bool is_limmI_low(jint imm, int n) { + int imml = imm & right_n_bits(n); + return is_limmI(imml) || is_limmI(imm); +} + +static inline int limmI_low(jint imm, int n) { + int imml = imm & right_n_bits(n); + return is_limmI(imml) ? imml : imm; +} + +%} + +source %{ + +// Given a register encoding, produce a Integer Register object +static Register reg_to_register_object(int register_encoding) { + assert(r0->encoding() == R_R0_enc && r15->encoding() == R_R15_enc, "right coding"); + return as_Register(register_encoding); +} + +// Given a register encoding, produce a Float Register object +static FloatRegister reg_to_FloatRegister_object(int register_encoding) { + assert(f0->encoding() == R_S0_enc && f31->encoding() == R_S31_enc, "right coding"); + // [d16,d31] share FloatRegister encoding with [f1,f31] since it numericall equals to ARM insn parameter encoding + // in contrary OptoReg encoding for d16+ is different + return as_FloatRegister((register_encoding&0x1f)|(register_encoding>>5)); +} + +void Compile::pd_compiler2_init() { + // Umimplemented +} + +OptoRegPair c2::return_value(int ideal_reg) { + assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" ); + static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, R_R0_num, R_R0_num, R_hf_ret_lo_num, R_hf_ret_lo_num, R_R0_num }; + static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, R_hf_ret_hi_num, R_R1_num }; +#ifndef HARD_FLOAT_CC + assert(hasFPU(), "non-VFP java ABI is not supported"); +#endif + return OptoRegPair( hi[ideal_reg], lo[ideal_reg]); +} + +#ifndef HARD_FLOAT_CC +OptoRegPair c2::c_return_value(int ideal_reg) { + assert( ideal_reg >= Op_RegI && ideal_reg <= Op_RegL, "only return normal values" ); + static int lo[Op_RegL+1] = { 0, 0, OptoReg::Bad, R_R0_num, R_R0_num, R_R0_num, R_R0_num, R_R0_num }; + static int hi[Op_RegL+1] = { 0, 0, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, OptoReg::Bad, R_R1_num, R_R1_num }; + return OptoRegPair( hi[ideal_reg], lo[ideal_reg]); +} +#endif + +// !!!!! Special hack to get all type of calls to specify the byte offset +// from the start of the call to the point where the return address +// will point. + +static uint call_static_enc_size(const MachCallNode *n, ciMethod *_method, bool _method_handle_invoke) { + int call_sz = (_method == NULL) ? + (maybe_far_call(n) ? 3 : 1) : + (far_branches() ? NativeCall::instruction_size / NativeInstruction::arm_insn_sz : 1); + return (call_sz + (_method_handle_invoke ? 2 : 0)) * + NativeInstruction::arm_insn_sz; +} + +static uint call_dynamic_enc_size() { + return 2 * NativeInstruction::arm_insn_sz + + (far_branches() ? NativeCall::instruction_size : NativeInstruction::arm_insn_sz); +} + +static uint call_runtime_enc_size(const MachCallNode *n) { + // bl or movw; movt; blx + bool far = maybe_far_call(n); + return (far ? 3 : 1) * NativeInstruction::arm_insn_sz; +} + +int MachCallStaticJavaNode::ret_addr_offset() { + return call_static_enc_size(this, _method, _method_handle_invoke) - + (_method_handle_invoke ? 1 : 0) * NativeInstruction::arm_insn_sz; +} + +int MachCallDynamicJavaNode::ret_addr_offset() { + return call_dynamic_enc_size(); +} + +int MachCallRuntimeNode::ret_addr_offset() { + return call_runtime_enc_size(this); +} +%} + +// The intptr_t operand types, defined by textual substitution. +// (Cf. opto/type.hpp. 
This lets us avoid many, many other ifdefs.) +#define immX immI +#define immXRot immIRot +#define iRegX iRegI +#define aimmX aimmI +#define limmX limmI +#define immX10x2 immI10x2 +#define LShiftX LShiftI +#define shimmX immU5 + +// Compatibility interface +#define aimmP immPRot +#define immIMov immIRot + +#define store_RegL iRegL +#define store_RegLd iRegLd +#define store_RegI iRegI +#define store_ptr_RegP iRegP + +//----------ATTRIBUTES--------------------------------------------------------- +//----------Operand Attributes------------------------------------------------- +op_attrib op_cost(1); // Required cost attribute + +//----------OPERANDS----------------------------------------------------------- +// Operand definitions must precede instruction definitions for correct parsing +// in the ADLC because operands constitute user defined types which are used in +// instruction definitions. + +//----------Simple Operands---------------------------------------------------- +// Immediate Operands + +operand immIRot() %{ + predicate(Assembler::is_valid_for_imm12(n->get_int())); + match(ConI); + + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +operand immIRotn() %{ + predicate(n->get_int() != 0 && Assembler::is_valid_for_imm12(~n->get_int())); + match(ConI); + + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +operand immIRotneg() %{ + // if Assembler::is_valid_for_imm12() is true for this constant, it is + // a immIRot and an optimal instruction combination exists to handle the + // constant as an immIRot + predicate(!Assembler::is_valid_for_imm12(n->get_int()) && Assembler::is_valid_for_imm12(-n->get_int())); + match(ConI); + + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +// Non-negative integer immediate that is encodable using the rotation scheme, +// and that when expanded fits in 31 bits. 
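+// (Illustration, assuming Assembler::is_valid_for_imm12() is the usual A32
+// modified-immediate test: 0x00ff0000 is accepted -- an 8-bit value rotated by
+// an even amount and non-negative -- while 0xff000000 is rejected because, as
+// a jint, it is negative even though the rotation scheme could encode it.)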
+operand immU31Rot() %{ + predicate((0 <= n->get_int()) && Assembler::is_valid_for_imm12(n->get_int())); + match(ConI); + + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +operand immPRot() %{ + predicate(n->get_ptr() == 0 || (Assembler::is_valid_for_imm12(n->get_ptr()) && ((ConPNode*)n)->type()->reloc() == relocInfo::none)); + + match(ConP); + + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +operand immLlowRot() %{ + predicate(n->get_long() >> 32 == 0 && Assembler::is_valid_for_imm12((int)n->get_long())); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand immLRot2() %{ + predicate(Assembler::is_valid_for_imm12((int)(n->get_long() >> 32)) && + Assembler::is_valid_for_imm12((int)(n->get_long()))); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 12-bit - for addressing mode +operand immI12() %{ + predicate((-4096 < n->get_int()) && (n->get_int() < 4096)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 10-bit disp and disp+4 - for addressing float pair +operand immI10x2() %{ + predicate((-1024 < n->get_int()) && (n->get_int() < 1024 - 4)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 12-bit disp and disp+4 - for addressing word pair +operand immI12x2() %{ + predicate((-4096 < n->get_int()) && (n->get_int() < 4096 - 4)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +//----------DEFINITION BLOCK--------------------------------------------------- +// Define name --> value mappings to inform the ADLC of an integer valued name +// Current support includes integer values in the range [0, 0x7FFFFFFF] +// Format: +// int_def ( , ); +// Generated Code in ad_.hpp +// #define () +// // value == +// Generated code in ad_.cpp adlc_verification() +// assert( == , "Expect () to equal "); +// +definitions %{ +// The default cost (of an ALU instruction). + int_def DEFAULT_COST ( 100, 100); + int_def HUGE_COST (1000000, 1000000); + +// Memory refs are twice as expensive as run-of-the-mill. + int_def MEMORY_REF_COST ( 200, DEFAULT_COST * 2); + +// Branches are even more expensive. + int_def BRANCH_COST ( 300, DEFAULT_COST * 3); + int_def CALL_COST ( 300, DEFAULT_COST * 3); +%} + + +//----------SOURCE BLOCK------------------------------------------------------- +// This is a block of C++ code which provides values, functions, and +// definitions necessary in the rest of the architecture description +source_hpp %{ +// Header information of the source block. +// Method declarations/definitions which are used outside +// the ad-scope can conveniently be defined here. +// +// To keep related declarations/definitions/uses close together, +// we switch between source %{ }% and source_hpp %{ }% freely as needed. + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#define STOP(error) __ stop(error) +#else +#define BLOCK_COMMENT(str) __ block_comment(str) +#define STOP(error) __ block_comment(error); stop(error) +#endif + +#define BIND(label) __ bind(label); BLOCK_COMMENT(#label ":") + +// Does destination need to be loaded in a register then passed to a +// branch instruction? 
+extern bool maybe_far_call(const CallNode *n); +extern bool maybe_far_call(const MachCallNode *n); +static inline bool cache_reachable() { + return MacroAssembler::_cache_fully_reachable(); +} +static inline bool far_branches() { + return MacroAssembler::far_branches(); +} + +extern bool PrintOptoAssembly; + +class c2 { +public: + static OptoRegPair return_value(int ideal_reg); +#ifndef HARD_FLOAT_CC + static OptoRegPair c_return_value(int ideal_reg); +#endif +}; + +class CallStubImpl { + + //-------------------------------------------------------------- + //---< Used for optimization in Compile::Shorten_branches >--- + //-------------------------------------------------------------- + + public: + // Size of call trampoline stub. + static uint size_call_trampoline() { + return 0; // no call trampolines on this platform + } + + // number of relocations needed by a call trampoline stub + static uint reloc_call_trampoline() { + return 0; // no call trampolines on this platform + } +}; + +class HandlerImpl { + + public: + + static int emit_exception_handler(CodeBuffer &cbuf); + static int emit_deopt_handler(CodeBuffer& cbuf); + + static uint size_exception_handler() { + return ( 3 * 4 ); + } + + + static uint size_deopt_handler() { + return ( 9 * 4 ); + } + +}; + +%} + +source %{ +#define __ _masm. + +static FloatRegister reg_to_FloatRegister_object(int register_encoding); +static Register reg_to_register_object(int register_encoding); + + +// **************************************************************************** + +// REQUIRED FUNCTIONALITY + +// Indicate if the safepoint node needs the polling page as an input. +// Since ARM does not have absolute addressing, it does. +bool SafePointNode::needs_polling_address_input() { + return true; +} + +// emit an interrupt that is caught by the debugger (for debugging compiler) +void emit_break(CodeBuffer &cbuf) { + MacroAssembler _masm(&cbuf); + __ bkpt(0); +} + +#ifndef PRODUCT +void MachBreakpointNode::format( PhaseRegAlloc *, outputStream *st ) const { + st->print("TA"); +} +#endif + +void MachBreakpointNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { + emit_break(cbuf); +} + +uint MachBreakpointNode::size(PhaseRegAlloc *ra_) const { + return MachNode::size(ra_); +} + + +void emit_nop(CodeBuffer &cbuf) { + MacroAssembler _masm(&cbuf); + __ nop(); +} + + +void emit_call_reloc(CodeBuffer &cbuf, const MachCallNode *n, MachOper *m, RelocationHolder const& rspec) { + int ret_addr_offset0 = n->as_MachCall()->ret_addr_offset(); + int call_site_offset = cbuf.insts()->mark_off(); + MacroAssembler _masm(&cbuf); + __ set_inst_mark(); // needed in emit_to_interp_stub() to locate the call + address target = (address)m->method(); + assert(n->as_MachCall()->entry_point() == target, "sanity"); + assert(maybe_far_call(n) == !__ reachable_from_cache(target), "sanity"); + assert(cache_reachable() == __ cache_fully_reachable(), "sanity"); + + assert(target != NULL, "need real address"); + + if (rspec.type() == relocInfo::runtime_call_type || + rspec.type() == relocInfo::none) { + __ call(target, rspec); + } else { + __ trampoline_call(Address(target, rspec), NULL); + } + int ret_addr_offset = __ offset(); + assert(ret_addr_offset - call_site_offset == ret_addr_offset0, "fix ret_addr_offset()"); +} + +//============================================================================= +// REQUIRED FUNCTIONALITY for encoding +void emit_lo(CodeBuffer &cbuf, int val) { } +void emit_hi(CodeBuffer &cbuf, int val) { } + + 
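+// (Editorial sketch, not part of the change: the maybe_far_call()/far_branches()
+// tests exist because an A32 B/BL instruction encodes a signed 24-bit word
+// offset, i.e. roughly +/-32MB around the branch. Assuming plain C++ and a
+// hypothetical helper -- the port's real test lives in MacroAssembler -- the
+// reachability idea is:
+//
+//   static inline bool bl_in_range(intptr_t from, intptr_t to) {
+//     intptr_t off = to - (from + 8);   // an A32 branch sees PC as insn + 8
+//     return -(intptr_t(1) << 25) <= off && off < (intptr_t(1) << 25);
+//   }
+//
+// Calls that may fall outside this window are emitted as movw/movt + blx, which
+// is why call_runtime_enc_size() earlier counts 3 instructions for the far case.)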
+//============================================================================= +const RegMask& MachConstantBaseNode::_out_RegMask = PTR_REG_mask(); + +int Compile::ConstantTable::calculate_table_base_offset() const { + int offset = -(size() / 2); + // vldr_f32, vldr_f64: 8-bit offset multiplied by 4: +/- 1024 + // ldr, ldrb : 12-bit offset: +/- 4096 + if (!Assembler::is_simm10(offset)) { + offset = Assembler::min_simm10(); + } + return offset; +} + +bool MachConstantBaseNode::requires_postalloc_expand() const { return false; } +void MachConstantBaseNode::postalloc_expand(GrowableArray *nodes, PhaseRegAlloc *ra_) { + ShouldNotReachHere(); +} + +void MachConstantBaseNode::emit(CodeBuffer& cbuf, PhaseRegAlloc* ra_) const { + Compile* C = ra_->C; + Compile::ConstantTable& constant_table = C->constant_table(); + MacroAssembler _masm(&cbuf); + + Register r = as_Register(ra_->get_encode(this)); + CodeSection* consts_section = __ code()->consts(); + int consts_size = consts_section->align_at_start(consts_section->size()); + assert(constant_table.size() == consts_size, "must be: %d == %d", constant_table.size(), consts_size); + + // Materialize the constant table base. + address baseaddr = consts_section->start() + -(constant_table.table_base_offset()); + RelocationHolder rspec = internal_word_Relocation::spec(baseaddr); + __ mov_address(r, baseaddr, rspec); +} + +uint MachConstantBaseNode::size(PhaseRegAlloc*) const { + return 8; +} + +#ifndef PRODUCT +void MachConstantBaseNode::format(PhaseRegAlloc* ra_, outputStream* st) const { + char reg[128]; + ra_->dump_register(this, reg); + st->print("MOV_SLOW &constanttable,%s\t! constant table base", reg); +} +#endif + +#ifndef PRODUCT +void MachPrologNode::format( PhaseRegAlloc *ra_, outputStream *st ) const { + Compile* C = ra_->C; + + for (int i = 0; i < OptoPrologueNops; i++) { + st->print_cr("NOP"); st->print("\t"); + } + + size_t framesize = C->frame_size_in_bytes(); + assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); + int bangsize = C->bang_size_in_bytes(); + // Remove two words for return addr and rbp, + framesize -= 2*wordSize; + bangsize -= 2*wordSize; + + // Calls to C2R adapters often do not accept exceptional returns. + // We require that their callers must bang for them. But be careful, because + // some VM calls (such as call site linkage) can use several kilobytes of + // stack. But the stack safety zone should account for that. + // See bugs 4446381, 4468289, 4497237. + if (C->need_stack_bang(bangsize)) { + st->print_cr("! stack bang (%d bytes)", bangsize); st->print("\t"); + } + st->print_cr("PUSH R_FP|R_LR_LR"); st->print("\t"); + if (framesize != 0) { + st->print ("SUB R_SP, R_SP, " SIZE_FORMAT,framesize); + } +} +#endif + +void MachPrologNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { + Compile* C = ra_->C; + MacroAssembler _masm(&cbuf); + + // insert a nop at the start of the prolog so we can patch in a + // branch if we need to invalidate the method later + __ nop(); + + size_t framesize = C->frame_size_in_bytes(); + assert((framesize & (StackAlignmentInBytes-1)) == 0, "frame size not aligned"); + int bangsize = C->bang_size_in_bytes(); + // Remove two words for return addr and fp, + framesize -= 2*wordSize; + bangsize -= 2*wordSize; + + // Calls to C2R adapters often do not accept exceptional returns. + // We require that their callers must bang for them. But be careful, because + // some VM calls (such as call site linkage) can use several kilobytes of + // stack. 
But the stack safety zone should account for that. + // See bugs 4446381, 4468289, 4497237. + if (C->need_stack_bang(bangsize)) { + __ arm_stack_overflow_check(bangsize, r12); + } + + __ push(RegSet::of(rfp, lr), sp); + if (framesize != 0) { + __ sub(sp, sp, framesize); + } + + // offset from scratch buffer is not valid + if (strcmp(cbuf.name(), "Compile::Fill_buffer") == 0) { + C->set_frame_complete( __ offset() ); + } + + if (C->has_mach_constant_base_node()) { + // NOTE: We set the table base offset here because users might be + // emitted before MachConstantBaseNode. + Compile::ConstantTable& constant_table = C->constant_table(); + constant_table.set_table_base_offset(constant_table.calculate_table_base_offset()); + } +} + +uint MachPrologNode::size(PhaseRegAlloc *ra_) const { + return MachNode::size(ra_); +} + +int MachPrologNode::reloc() const { + return 10; // a large enough number +} + +//============================================================================= +#ifndef PRODUCT +void MachEpilogNode::format( PhaseRegAlloc *ra_, outputStream *st ) const { + Compile* C = ra_->C; + + size_t framesize = C->frame_size_in_bytes(); + framesize -= 2*wordSize; + + if (framesize != 0) { + st->print("ADD R_SP, R_SP, " SIZE_FORMAT "\n\t",framesize); + } + st->print("POP R_FP|R_LR_LR"); + + if (do_polling() && ra_->C->is_method_compilation()) { + st->print("\n\t"); + st->print("MOV r12, #PollAddr\t! Load Polling address\n\t"); + st->print("LDR r12,[r12]\t!Poll for Safepointing"); + } +} +#endif + +void MachEpilogNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { + MacroAssembler _masm(&cbuf); + Compile* C = ra_->C; + + size_t framesize = C->frame_size_in_bytes(); + framesize -= 2*wordSize; + if (framesize != 0) { + __ add(sp, sp, framesize); + } + __ pop(RegSet::of(rfp, lr), sp); + + if (StackReservedPages > 0 && C->has_reserved_stack_access()) { + __ reserved_stack_check(); + } + + // If this does safepoint polling, then do it here + if (do_polling() && ra_->C->is_method_compilation()) { + // mov here is usually one or two instruction + __ mov_address(r12, (address)os::get_polling_page(), RelocationHolder::none); + __ relocate(relocInfo::poll_return_type); + __ ldr(r12, Address(r12)); + } +} + +uint MachEpilogNode::size(PhaseRegAlloc *ra_) const { + return MachNode::size(ra_); +} + +int MachEpilogNode::reloc() const { + return 16; // a large enough number +} + +const Pipeline * MachEpilogNode::pipeline() const { + return MachNode::pipeline_class(); +} + +int MachEpilogNode::safepoint_offset() const { + assert( do_polling(), "no return for this epilog node"); + // return MacroAssembler::size_of_sethi(os::get_polling_page()); + Unimplemented(); + return 0; +} + +//============================================================================= + +// Figure out which register class each belongs in: rc_int, rc_float, rc_stack +enum RC { rc_bad, rc_int, rc_float, rc_stack }; +static enum RC rc_class( OptoReg::Name reg ) { + if (!OptoReg::is_valid(reg)) return rc_bad; + if (OptoReg::is_stack(reg)) return rc_stack; + VMReg r = OptoReg::as_VMReg(reg); + if (r->is_Register()) return rc_int; + assert(r->is_FloatRegister(), "must be"); + return rc_float; +} + +static inline bool is_iRegLd_memhd(OptoReg::Name src_first, OptoReg::Name src_second, int offset) { + int rlo = Matcher::_regEncode[src_first]; + int rhi = Matcher::_regEncode[src_second]; + // if (!((rlo&1)==0 && (rlo+1 == rhi))) { + // tty->print_cr("CAUGHT BAD LDRD/STRD"); + // } + return (rlo&1)==0 && (rlo+1 == rhi) && 
is_memoryHD(offset); +} + +uint MachSpillCopyNode::implementation( CodeBuffer *cbuf, + PhaseRegAlloc *ra_, + bool do_size, + outputStream* st ) const { + // Get registers to move + OptoReg::Name src_second = ra_->get_reg_second(in(1)); + OptoReg::Name src_first = ra_->get_reg_first(in(1)); + OptoReg::Name dst_second = ra_->get_reg_second(this ); + OptoReg::Name dst_first = ra_->get_reg_first(this ); + + enum RC src_second_rc = rc_class(src_second); + enum RC src_first_rc = rc_class(src_first); + enum RC dst_second_rc = rc_class(dst_second); + enum RC dst_first_rc = rc_class(dst_first); + + assert( OptoReg::is_valid(src_first) && OptoReg::is_valid(dst_first), "must move at least 1 register" ); + + // Generate spill code! + int size = 0; + + if (src_first == dst_first && src_second == dst_second) + return size; // Self copy, no move + +#ifdef TODO + if (bottom_type()->isa_vect() != NULL) { + } +#endif + + // Shared code does not expect instruction set capability based bailouts here. + // Handle offset unreachable bailout with minimal change in shared code. + // Bailout only for real instruction emit. + // This requires a single comment change in shared code. ( see output.cpp "Normal" instruction case ) + + MacroAssembler _masm(cbuf); + + // -------------------------------------- + // Check for mem-mem move. Load into unused float registers and fall into + // the float-store case. + if (src_first_rc == rc_stack && dst_first_rc == rc_stack) { + int offset = ra_->reg2offset(src_first); + if (cbuf && !is_memoryfp(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + if (src_second_rc != rc_bad) { + assert((src_first&1)==0 && src_first+1 == src_second, "pair of registers must be aligned/contiguous"); + src_first = OptoReg::Name(R_mem_copy_lo_num); + src_second = OptoReg::Name(R_mem_copy_hi_num); + src_first_rc = rc_float; + src_second_rc = rc_float; + if (cbuf) { + __ vldr_f64(Rmemcopy, Address(sp, offset)); + } else if (!do_size) { + st->print(LDR_DOUBLE " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset); + } + } else { + src_first = OptoReg::Name(R_mem_copy_lo_num); + src_first_rc = rc_float; + if (cbuf) { + __ vldr_f32(Rmemcopy, Address(sp, offset)); + } else if (!do_size) { + st->print(LDR_FLOAT " R_%s,[R_SP + #%d]\t! 
spill",OptoReg::regname(src_first),offset); + } + } + size += 4; + } + } + + if (src_second_rc == rc_stack && dst_second_rc == rc_stack) { + Unimplemented(); + } + + // -------------------------------------- + // Check for integer reg-reg copy + if (src_first_rc == rc_int && dst_first_rc == rc_int) { + // Else normal reg-reg copy + assert( src_second != dst_first, "smashed second before evacuating it" ); + if (cbuf) { + __ mov(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else if (!do_size) { + st->print("MOV R_%s, R_%s\t# spill", + Matcher::regName[dst_first], + Matcher::regName[src_first]); +#endif + } + size += 4; + } + + // Check for integer store + if (src_first_rc == rc_int && dst_first_rc == rc_stack) { + int offset = ra_->reg2offset(dst_first); + if (cbuf && !is_memoryI(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + if (src_second_rc != rc_bad && is_iRegLd_memhd(src_first, src_second, offset)) { + assert((src_first&1)==0 && src_first+1 == src_second, "pair of registers must be aligned/contiguous"); + if (cbuf) { + __ strd(reg_to_register_object(Matcher::_regEncode[src_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(STR_64 " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first), offset); +#endif + } + return size + 4; + } else { + if (cbuf) { + __ str(reg_to_register_object(Matcher::_regEncode[src_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(STR_32 " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first), offset); +#endif + } + } + } + size += 4; + } + + // Check for integer load + if (dst_first_rc == rc_int && src_first_rc == rc_stack) { + int offset = ra_->reg2offset(src_first); + if (cbuf && !is_memoryI(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + if (src_second_rc != rc_bad && is_iRegLd_memhd(dst_first, dst_second, offset)) { + assert((src_first&1)==0 && src_first+1 == src_second, "pair of registers must be aligned/contiguous"); + if (cbuf) { + __ ldrd(reg_to_register_object(Matcher::_regEncode[dst_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(LDR_64 " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_first), offset); +#endif + } + return size + 4; + } else { + if (cbuf) { + __ ldr(reg_to_register_object(Matcher::_regEncode[dst_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(LDR_32 " R_%s,[R_SP + #%d]\t! 
spill",OptoReg::regname(dst_first), offset); +#endif + } + } + } + size += 4; + } + + // Check for float reg-reg copy + if (src_first_rc == rc_float && dst_first_rc == rc_float) { + if (src_second_rc != rc_bad) { + assert((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers must be aligned/contiguous"); + if (cbuf) { + __ vmov_f64(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else if (!do_size) { + st->print(MOV_DOUBLE " R_%s, R_%s\t# spill", + Matcher::regName[dst_first], + Matcher::regName[src_first]); +#endif + } + return 4; + } + if (cbuf) { + __ vmov_f32(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else if (!do_size) { + st->print(MOV_FLOAT " R_%s, R_%s\t# spill", + Matcher::regName[dst_first], + Matcher::regName[src_first]); +#endif + } + size = 4; + } + + // Check for float store + if (src_first_rc == rc_float && dst_first_rc == rc_stack) { + int offset = ra_->reg2offset(dst_first); + if (cbuf && !is_memoryfp(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + // Further check for aligned-adjacent pair, so we can use a double store + if (src_second_rc != rc_bad) { + assert((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers and stack slots must be aligned/contiguous"); + if (cbuf) { + __ vstr_f64(reg_to_FloatRegister_object(Matcher::_regEncode[src_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(STR_DOUBLE " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset); +#endif + } + return size + 4; + } else { + if (cbuf) { + __ vstr_f32(reg_to_FloatRegister_object(Matcher::_regEncode[src_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(STR_FLOAT " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_first),offset); +#endif + } + } + } + size += 4; + } + + // Check for float load + if (dst_first_rc == rc_float && src_first_rc == rc_stack) { + int offset = ra_->reg2offset(src_first); + if (cbuf && !is_memoryfp(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + // Further check for aligned-adjacent pair, so we can use a double store + if (src_second_rc != rc_bad) { + assert((src_first&1)==0 && src_first+1 == src_second && (dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers and stack slots must be aligned/contiguous"); + if (cbuf) { + __ vldr_f64(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(LDR_DOUBLE " R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_first),offset); +#endif + } + return size + 4; + } else { + if (cbuf) { + __ vldr_f32(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(LDR_FLOAT " R_%s,[R_SP + #%d]\t! 
spill",OptoReg::regname(dst_first),offset); +#endif + } + } + } + size += 4; + } + + // check for int reg -> float reg move + if (src_first_rc == rc_int && dst_first_rc == rc_float) { + // Further check for aligned-adjacent pair, so we can use a single instruction + if (src_second_rc != rc_bad) { + assert((dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers must be aligned/contiguous"); + assert((src_first&1)==0 && src_first+1 == src_second, "pairs of registers must be aligned/contiguous"); + assert(src_second_rc == rc_int && dst_second_rc == rc_float, "unsupported"); + if (cbuf) { + __ vmov_f64(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first]), reg_to_register_object(Matcher::_regEncode[src_second])); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print("FMDRR R_%s, R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first), OptoReg::regname(src_second)); +#endif + } + return size + 4; + } else { + if (cbuf) { + __ vmov_f32(reg_to_FloatRegister_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(FMSR " R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first)); +#endif + } + size += 4; + } + } + + // check for float reg -> int reg move + if (src_first_rc == rc_float && dst_first_rc == rc_int) { + // Further check for aligned-adjacent pair, so we can use a single instruction + if (src_second_rc != rc_bad) { + assert((src_first&1)==0 && src_first+1 == src_second, "pairs of registers must be aligned/contiguous"); + assert((dst_first&1)==0 && dst_first+1 == dst_second, "pairs of registers must be aligned/contiguous"); + assert(src_second_rc == rc_float && dst_second_rc == rc_int, "unsupported"); + if (cbuf) { + __ vmov_f64(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_register_object(Matcher::_regEncode[dst_second]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print("FMRRD R_%s, R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(dst_second), OptoReg::regname(src_first)); +#endif + } + return size + 4; + } else { + if (cbuf) { + __ vmov_f32(reg_to_register_object(Matcher::_regEncode[dst_first]), reg_to_FloatRegister_object(Matcher::_regEncode[src_first])); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print(FMRS " R_%s, R_%s\t! spill",OptoReg::regname(dst_first), OptoReg::regname(src_first)); +#endif + } + size += 4; + } + } + + // -------------------------------------------------------------------- + // Check for hi bits still needing moving. Only happens for misaligned + // arguments to native calls. + if (src_second == dst_second) + return size; // Self copy; no move + assert( src_second_rc != rc_bad && dst_second_rc != rc_bad, "src_second & dst_second cannot be Bad" ); + + // Check for integer reg-reg copy. 
Hi bits are stuck up in the top + // 32-bits of a 64-bit register, but are needed in low bits of another + // register (else it's a hi-bits-to-hi-bits copy which should have + // happened already as part of a 64-bit move) + if (src_second_rc == rc_int && dst_second_rc == rc_int) { + if (cbuf) { + __ mov(reg_to_register_object(Matcher::_regEncode[dst_second]), reg_to_register_object(Matcher::_regEncode[src_second])); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print("MOV R_%s, R_%s\t# spill high", + Matcher::regName[dst_second], + Matcher::regName[src_second]); +#endif + } + return size+4; + } + + // Check for high word integer store + if (src_second_rc == rc_int && dst_second_rc == rc_stack) { + int offset = ra_->reg2offset(dst_second); + + if (cbuf && !is_memoryP(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + if (cbuf) { + __ str(reg_to_register_object(Matcher::_regEncode[src_second]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print("STR R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(src_second), offset); +#endif + } + } + return size + 4; + } + + // Check for high word integer load + if (dst_second_rc == rc_int && src_second_rc == rc_stack) { + int offset = ra_->reg2offset(src_second); + if (cbuf && !is_memoryP(offset)) { + ra_->C->record_method_not_compilable("unable to handle large constant offsets"); + return 0; + } else { + if (cbuf) { + __ ldr(reg_to_register_object(Matcher::_regEncode[dst_second]), Address(sp, offset)); +#ifndef PRODUCT + } else if (!do_size) { + if (size != 0) st->print("\n\t"); + st->print("LDR R_%s,[R_SP + #%d]\t! spill",OptoReg::regname(dst_second), offset); +#endif + } + } + return size + 4; + } + + Unimplemented(); + return 0; // Mute compiler +} + +#ifndef PRODUCT +void MachSpillCopyNode::format( PhaseRegAlloc *ra_, outputStream *st ) const { + implementation( NULL, ra_, false, st ); +} +#endif + +void MachSpillCopyNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { + implementation( &cbuf, ra_, false, NULL ); +} + +uint MachSpillCopyNode::size(PhaseRegAlloc *ra_) const { + return implementation( NULL, ra_, true, NULL ); +} + +//============================================================================= +#ifndef PRODUCT +void MachNopNode::format( PhaseRegAlloc *, outputStream *st ) const { + st->print("NOP \t# %d bytes pad for loops and calls", 4 * _count); +} +#endif + +void MachNopNode::emit(CodeBuffer &cbuf, PhaseRegAlloc * ) const { + MacroAssembler _masm(&cbuf); + for(int i = 0; i < _count; i += 1) { + __ nop(); + } +} + +uint MachNopNode::size(PhaseRegAlloc *ra_) const { + return 4 * _count; +} + + +//============================================================================= +#ifndef PRODUCT +void BoxLockNode::format( PhaseRegAlloc *ra_, outputStream *st ) const { + int offset = ra_->reg2offset(in_RegMask(0).find_first_elem()); + int reg = ra_->get_reg_first(this); + st->print("ADD %s,R_SP+#%d",Matcher::regName[reg], offset); +} +#endif + +void BoxLockNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { + MacroAssembler _masm(&cbuf); + int offset = ra_->reg2offset(in_RegMask(0).find_first_elem()); + int reg = ra_->get_encode(this); + Register dst = reg_to_register_object(reg); + + if (is_aimm(offset)) { + __ add(dst, sp, offset); + } else { + __ mov(dst, offset); + __ add(dst, sp, dst); + } +} + +uint BoxLockNode::size(PhaseRegAlloc *ra_) const { + // BoxLockNode is 
not a MachNode, so we can't just call MachNode::size(ra_) + assert(ra_ == ra_->C->regalloc(), "sanity"); + return ra_->C->scratch_emit_size(this); +} + +//============================================================================= +#ifndef PRODUCT +#define R_RTEMP "R_R12" +void MachUEPNode::format( PhaseRegAlloc *ra_, outputStream *st ) const { + st->print_cr("\nUEP:"); + if (UseCompressedClassPointers) { + st->print_cr("\tLDR_w " R_RTEMP ",[R_R0 + oopDesc::klass_offset_in_bytes]\t! Inline cache check"); + st->print_cr("\tdecode_klass " R_RTEMP); + } else { + st->print_cr("\tLDR " R_RTEMP ",[R_R0 + oopDesc::klass_offset_in_bytes]\t! Inline cache check"); + } + st->print_cr("\tCMP " R_RTEMP ",R_R12" ); + st->print ("\tB.NE SharedRuntime::handle_ic_miss_stub"); +} +#endif + +void MachUEPNode::emit(CodeBuffer &cbuf, PhaseRegAlloc *ra_) const { + MacroAssembler _masm(&cbuf); + Register iCache = reg_to_register_object(Matcher::inline_cache_reg_encode()); + assert(iCache == rscratch2/*Ricklass*/, "should be"); + Register receiver = r0; + + __ load_klass(r9, receiver); + __ cmp(r9, iCache); + // r9 seems temporary here + __ jump(SharedRuntime::get_ic_miss_stub(), relocInfo::runtime_call_type, r9, Assembler::NE); +} + +uint MachUEPNode::size(PhaseRegAlloc *ra_) const { + return MachNode::size(ra_); +} + + +// REQUIRED EMIT CODE + +//============================================================================= + +// Emit exception handler code. +int HandlerImpl::emit_exception_handler(CodeBuffer& cbuf) { + MacroAssembler _masm(&cbuf); + + address base = __ start_a_stub(size_exception_handler()); + if (base == NULL) { + ciEnv::current()->record_failure("CodeCache is full"); + return 0; // CodeBuffer::expand failed + } + + int offset = __ offset(); + + // OK to trash LR, because exception blob will kill it + __ jump(OptoRuntime::exception_blob()->entry_point(), relocInfo::runtime_call_type, lr); + + assert(__ offset() - offset <= (int) size_exception_handler(), "overflow"); + + __ end_a_stub(); + + return offset; +} + +int HandlerImpl::emit_deopt_handler(CodeBuffer& cbuf) { + // Can't use any of the current frame's registers as we may have deopted + // at a poll and everything can be live. 
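+  // Editorial note on the sequence below: it only needs LR as a scratch
+  // register, so it makes a stack slot for the deopt return PC, saves LR,
+  // materializes the address of this handler (deopt_pc) into LR, stores it
+  // into that slot, restores LR, and tail-jumps to the deopt blob. The assert
+  // at the end checks that the whole sequence stays within the
+  // size_deopt_handler() budget of nine 4-byte instructions.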
+ MacroAssembler _masm(&cbuf); + + address base = __ start_a_stub(size_deopt_handler()); + if (base == NULL) { + ciEnv::current()->record_failure("CodeCache is full"); + return 0; // CodeBuffer::expand failed + } + + int offset = __ offset(); + address deopt_pc = __ pc(); + + __ sub(sp, sp, wordSize); // make room for saved PC + __ push(lr); // save LR that may be live when we get here + __ mov_relative_address(lr, deopt_pc); + __ str(lr, Address(sp, wordSize)); // save deopt PC + __ pop(lr); // restore LR + // rscratch1 seems killed at deopt_blob + __ jump(SharedRuntime::deopt_blob()->unpack(), relocInfo::runtime_call_type, rscratch1); + + assert(__ offset() - offset <= (int) size_deopt_handler(), "overflow"); + + __ end_a_stub(); + return offset; +} + +// REQUIRED MATCHER CODE + +//============================================================================= + +const bool Matcher::match_rule_supported(int opcode) { + if (!has_match_rule(opcode)) + return false; + + switch (opcode) { + case Op_PopCountI: + case Op_PopCountL: + if (!UsePopCountInstruction) + return false; + break; + case Op_LShiftCntV: + case Op_RShiftCntV: + case Op_AddVB: + case Op_AddVS: + case Op_AddVI: + case Op_AddVL: + case Op_SubVB: + case Op_SubVS: + case Op_SubVI: + case Op_SubVL: + case Op_MulVS: + case Op_MulVI: + case Op_LShiftVB: + case Op_LShiftVS: + case Op_LShiftVI: + case Op_LShiftVL: + case Op_RShiftVB: + case Op_RShiftVS: + case Op_RShiftVI: + case Op_RShiftVL: + case Op_URShiftVB: + case Op_URShiftVS: + case Op_URShiftVI: + case Op_URShiftVL: + case Op_AndV: + case Op_OrV: + case Op_XorV: + return VM_Version::features() & FT_AdvSIMD; + case Op_LoadVector: + case Op_StoreVector: + case Op_AddVF: + case Op_SubVF: + case Op_MulVF: + return VM_Version::features() & (FT_VFPV2 | FT_AdvSIMD); + case Op_AddVD: + case Op_SubVD: + case Op_MulVD: + case Op_DivVF: + case Op_DivVD: + return VM_Version::features() & FT_VFPV2; + } + + return true; // Per default match rules are supported. +} + +const bool Matcher::match_rule_supported_vector(int opcode, int vlen) { + + // TODO + // identify extra cases that we might want to provide match rules for + // e.g. Op_ vector nodes and other intrinsics while guarding with vlen + bool ret_value = match_rule_supported(opcode); + // Add rules here. + + return ret_value; // Per default match rules are supported. +} + +const bool Matcher::has_predicated_vectors(void) { + return false; +} + +const int Matcher::float_pressure(int default_pressure_threshold) { + return default_pressure_threshold; +} + +int Matcher::regnum_to_fpu_offset(int regnum) { + return regnum - 32; // The FP registers are in the second chunk +} + +// Vector width in bytes +const int Matcher::vector_width_in_bytes(BasicType bt) { + return MaxVectorSize; +} + +// Vector ideal reg corresponding to specified size in bytes +const uint Matcher::vector_ideal_reg(int size) { + assert(MaxVectorSize >= size, ""); + switch(size) { + case 8: return Op_VecD; + case 16: return Op_VecX; + } + ShouldNotReachHere(); + return 0; +} + +const uint Matcher::vector_shift_count_ideal_reg(int size) { + return vector_ideal_reg(size); +} + +// Limits on vector size (number of elements) loaded into vector. 
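+// (Illustration: with MaxVectorSize == 16, i.e. one 128-bit NEON Q register,
+// max_vector_size(T_INT) below is 16/4 == 4 lanes and min_vector_size(T_INT)
+// is 8/4 == 2 lanes.)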
+const int Matcher::max_vector_size(const BasicType bt) { + assert(is_java_primitive(bt), "only primitive type vectors"); + return vector_width_in_bytes(bt)/type2aelembytes(bt); +} + +const int Matcher::min_vector_size(const BasicType bt) { + assert(is_java_primitive(bt), "only primitive type vectors"); + return 8/type2aelembytes(bt); +} + +// ARM doesn't support misaligned vectors store/load. +const bool Matcher::misaligned_vectors_ok() { + return false; +} + +// ARM doesn't support AES intrinsics +const bool Matcher::pass_original_key_for_aes() { + return false; +} + +const bool Matcher::convL2FSupported(void) { + return false; // TODO why not? +} + +// Is this branch offset short enough that a short branch can be used? +// +// NOTE: If the platform does not provide any short branch variants, then +// this method should return false for offset 0. +bool Matcher::is_short_branch_offset(int rule, int br_size, int offset) { + // The passed offset is relative to address of the branch. + // On ARM a branch displacement is calculated relative to address + // of the branch + 8. + // + // offset -= 8; + // return (Assembler::is_simm24(offset)); + return false; +} + +const bool Matcher::isSimpleConstant64(jlong value) { + // Will one (StoreL ConL) be cheaper than two (StoreI ConI)?. + return false; +} + +// No scaling for the parameter the ClearArray node. +const bool Matcher::init_array_count_is_in_bytes = true; + +// Needs 2 CMOV's for longs. +const int Matcher::long_cmove_cost() { return 2; } + +// CMOVF/CMOVD are expensive on ARM. +const int Matcher::float_cmove_cost() { return ConditionalMoveLimit; } + +// Does the CPU require late expand (see block.cpp for description of late expand)? +const bool Matcher::require_postalloc_expand = false; + +// Do we need to mask the count passed to shift instructions or does +// the cpu only look at the lower 5/6 bits anyway? +// FIXME: does this handle vector shifts as well? +const bool Matcher::need_masked_shift_count = true; + +const bool Matcher::convi2l_type_required = true; + +// Should the Matcher clone shifts on addressing modes, expecting them +// to be subsumed into complex addressing expressions or compute them +// into registers? +bool Matcher::clone_address_expressions(AddPNode* m, Matcher::MStack& mstack, VectorSet& address_visited) { + return clone_base_plus_offset_address(m, mstack, address_visited); +} + +void Compile::reshape_address(AddPNode* addp) { +} + +bool Matcher::narrow_oop_use_complex_address() { + ShouldNotCallThis(); + return false; +} + +bool Matcher::narrow_klass_use_complex_address() { + ShouldNotCallThis(); + return false; +} + +bool Matcher::const_oop_prefer_decode() { + ShouldNotCallThis(); + return true; +} + +bool Matcher::const_klass_prefer_decode() { + ShouldNotCallThis(); + return true; +} + +// Is it better to copy float constants, or load them directly from memory? +// Intel can load a float constant from a direct address, requiring no +// extra registers. Most RISCs will have to materialize an address into a +// register first, so they would do better to copy the constant from stack. +const bool Matcher::rematerialize_float_constants = false; + +// If CPU can load and store mis-aligned doubles directly then no fixup is +// needed. Else we split the double into 2 integer pieces and move it +// piece-by-piece. Only happens when passing doubles into C code as the +// Java calling convention forces doubles to be aligned. +const bool Matcher::misaligned_doubles_ok = false; + +// No-op on ARM. 
+void Matcher::pd_implicit_null_fixup(MachNode *node, uint idx) { +} + +// Advertise here if the CPU requires explicit rounding operations +// to implement the UseStrictFP mode. +const bool Matcher::strict_fp_requires_explicit_rounding = false; + +// Are floats converted to double when stored to stack during deoptimization? +// ARM does not handle callee-save floats. +bool Matcher::float_in_double() { + return false; +} + +// Do ints take an entire long register or just half? +// Note that we if-def off of _LP64. +// The relevant question is how the int is callee-saved. In _LP64 +// the whole long is written but de-opt'ing will have to extract +// the relevant 32 bits, in not-_LP64 only the low 32 bits is written. +const bool Matcher::int_in_long = false; + +// Return whether or not this register is ever used as an argument. This +// function is used on startup to build the trampoline stubs in generateOptoStub. +// Registers not mentioned will be killed by the VM call in the trampoline, and +// arguments in those registers not be available to the callee. +bool Matcher::can_be_java_arg( int reg ) { + if (reg == R_R0_num || + reg == R_R1_num || + reg == R_R2_num || + reg == R_R3_num) return true; + + if (reg >= R_S0_num && + reg <= R_S15_num) return true; + return false; +} + +bool Matcher::is_spillable_arg( int reg ) { + return can_be_java_arg(reg); +} + +bool Matcher::use_asm_for_ldiv_by_con( jlong divisor ) { + return false; +} + +// Register for DIVI projection of divmodI +RegMask Matcher::divI_proj_mask() { + ShouldNotReachHere(); + return RegMask(); +} + +// Register for MODI projection of divmodI +RegMask Matcher::modI_proj_mask() { + ShouldNotReachHere(); + return RegMask(); +} + +// Register for DIVL projection of divmodL +RegMask Matcher::divL_proj_mask() { + ShouldNotReachHere(); + return RegMask(); +} + +// Register for MODL projection of divmodL +RegMask Matcher::modL_proj_mask() { + ShouldNotReachHere(); + return RegMask(); +} + +const RegMask Matcher::method_handle_invoke_SP_save_mask() { + return FP_REGP_mask(); +} + +bool maybe_far_call(const CallNode *n) { + return !MacroAssembler::_reachable_from_cache(n->as_Call()->entry_point()); +} + +bool maybe_far_call(const MachCallNode *n) { + return !MacroAssembler::_reachable_from_cache(n->as_MachCall()->entry_point()); +} + +%} + +//----------ENCODING BLOCK----------------------------------------------------- +// This block specifies the encoding classes used by the compiler to output +// byte streams. Encoding classes are parameterized macros used by +// Machine Instruction Nodes in order to generate the bit encoding of the +// instruction. Operands specify their base encoding interface with the +// interface keyword. There are currently supported four interfaces, +// REG_INTER, CONST_INTER, MEMORY_INTER, & COND_INTER. REG_INTER causes an +// operand to generate a function which returns its register number when +// queried. CONST_INTER causes an operand to generate a function which +// returns the value of the constant when queried. MEMORY_INTER causes an +// operand to generate four functions which return the Base Register, the +// Index Register, the Scale Value, and the Offset Value of the operand when +// queried. COND_INTER causes an operand to generate six functions which +// return the encoding code (ie - encoding bits for the instruction) +// associated with each basic boolean condition for a conditional instruction. +// +// Instructions specify two basic values for encoding. 
Again, a function +// is available to check if the constant displacement is an oop. They use the +// ins_encode keyword to specify their encoding classes (which must be +// a sequence of enc_class names, and their parameters, specified in +// the encoding block), and they use the +// opcode keyword to specify, in order, their primary, secondary, and +// tertiary opcode. Only the opcode sections which a particular instruction +// needs for encoding need to be specified. +encode %{ + enc_class call_epilog %{ + // nothing + %} + + enc_class Java_To_Runtime (method meth) %{ + // CALL directly to the runtime + emit_call_reloc(cbuf, as_MachCall(), $meth, runtime_call_Relocation::spec()); + %} + + enc_class Java_Static_Call (method meth) %{ + // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine + // who we intended to call. + + if ( !_method) { + emit_call_reloc(cbuf, as_MachCall(), $meth, runtime_call_Relocation::spec()); + } else { + int method_index = resolved_method_index(cbuf); + RelocationHolder rspec = _optimized_virtual ? opt_virtual_call_Relocation::spec(method_index) + : static_call_Relocation::spec(method_index); + emit_call_reloc(cbuf, as_MachCall(), $meth, rspec); + + // Emit stubs for static call. + address stub = CompiledStaticCall::emit_to_interp_stub(cbuf); + if (stub == NULL) { + ciEnv::current()->record_failure("CodeCache is full"); + return; + } + } + %} + + enc_class save_last_PC %{ + // preserve mark + address mark = cbuf.insts()->mark(); + debug_only(int off0 = cbuf.insts_size()); + MacroAssembler _masm(&cbuf); + int ret_addr_offset = as_MachCall()->ret_addr_offset(); + __ adr(lr, mark + ret_addr_offset); + __ str(lr, Address(Rthread, JavaThread::last_Java_pc_offset())); + debug_only(int off1 = cbuf.insts_size()); + assert(off1 - off0 == 2 * Assembler::InstructionSize, "correct size prediction"); + // restore mark + cbuf.insts()->set_mark(mark); + %} + + enc_class preserve_SP %{ + // preserve mark + address mark = cbuf.insts()->mark(); + debug_only(int off0 = cbuf.insts_size()); + MacroAssembler _masm(&cbuf); + // FP is preserved across all calls, even compiled calls. + // Use it to preserve SP in places where the callee might change the SP. + __ mov(Rmh_SP_save, sp); + debug_only(int off1 = cbuf.insts_size()); + assert(off1 - off0 == 4, "correct size prediction"); + // restore mark + cbuf.insts()->set_mark(mark); + %} + + enc_class restore_SP %{ + MacroAssembler _masm(&cbuf); + __ mov(sp, Rmh_SP_save); + %} + + enc_class Java_Dynamic_Call (method meth) %{ + MacroAssembler _masm(&cbuf); + Register R12_ic_reg = reg_to_register_object(Matcher::inline_cache_reg_encode()); + assert(R12_ic_reg == rscratch2/*Ricklass*/, "should be"); + __ set_inst_mark(); + __ movw_i(R12_ic_reg, ((unsigned int)Universe::non_oop_word()) & 0xffff); + __ movt_i(R12_ic_reg, ((unsigned int)Universe::non_oop_word()) >> 16); + address virtual_call_oop_addr = __ inst_mark(); + // CALL to fixup routine. Fixup routine uses ScopeDesc info to determine + // who we intended to call. + int method_index = resolved_method_index(cbuf); + emit_call_reloc(cbuf, as_MachCall(), $meth, virtual_call_Relocation::spec(virtual_call_oop_addr, method_index)); + %} + + enc_class LdReplImmI(immI src, regD dst, iRegI tmp, int cnt, int wth) %{ + // FIXME: load from constant table? 
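+    // (Editorial worked example: for cnt=4, wth=1, src=0x12 the loop below
+    // masks to the element width and ORs in shifted copies,
+    //   0x12 -> 0x1212 -> 0x121212 -> 0x12121212,
+    // and the final vmov_f64 (FMDRR) then copies that 32-bit pattern into both
+    // halves of the destination D register.)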
+ // Load a constant replicated "count" times with width "width" + int count = $cnt$$constant; + int width = $wth$$constant; + assert(count*width == 4, "sanity"); + int val = $src$$constant; + if (width < 4) { + int bit_width = width * 8; + val &= (((int)1) << bit_width) - 1; // mask off sign bits + for (int i = 0; i < count - 1; i++) { + val |= (val << bit_width); + } + } + MacroAssembler _masm(&cbuf); + + if (val == -1) { + __ mvn_i($tmp$$Register, 0); + } else if (val == 0) { + __ mov_i($tmp$$Register, 0); + } else { + __ movw_i($tmp$$Register, val & 0xffff); + __ movt_i($tmp$$Register, (unsigned int)val >> 16); + } + __ vmov_f64($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); + %} + + enc_class LdReplImmF(immF src, regD dst, iRegI tmp) %{ + // Replicate float con 2 times and pack into vector (8 bytes) in regD. + float fval = $src$$constant; + int val = *((int*)&fval); + MacroAssembler _masm(&cbuf); + + if (val == -1) { + __ mvn_i($tmp$$Register, 0); + } else if (val == 0) { + __ mov_i($tmp$$Register, 0); + } else { + __ movw_i($tmp$$Register, val & 0xffff); + __ movt_i($tmp$$Register, (unsigned int)val >> 16); + } + __ vmov_f64($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); + %} + + enc_class enc_String_Compare(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result, + iRegI tmp1, iRegI tmp2, Q0_regD ftmp1, Q1_regD ftmp2, + int bytes_per_char1, int bytes_per_char2) %{ + MacroAssembler _masm(&cbuf); + + Register str1 = $str1$$Register; + Register str2 = $str2$$Register; + Register cnt1 = $cnt1$$Register; + Register cnt2 = $cnt2$$Register; + Register tmp1 = $tmp1$$Register; + Register tmp2 = $tmp2$$Register; + FloatRegister ftmp1 = $ftmp1$$FloatRegister; + FloatRegister ftmp2 = $ftmp2$$FloatRegister; + Register result = $result$$Register; + int bytes_per_char1 = $bytes_per_char1; + int bytes_per_char2 = $bytes_per_char2; + + typedef void (Assembler::*ldfp)(Register, const Address &, Assembler::Condition); + typedef void (Assembler::*usubp)(Register, Register, Register, Assembler::Condition); + ldfp ldf_16 = &Assembler::ldrh; + ldfp ldf_8 = &Assembler::ldrb; + + // slow path: single char load + int cnt_per_char = bytes_per_char1==2 && bytes_per_char2==2 ? 2 : 1; + ldfp lds1 = bytes_per_char1 == 2 ? ldf_16 : ldf_8; + ldfp lds2 = bytes_per_char2 == 2 ? ldf_16 : ldf_8; + usubp usub = bytes_per_char1 == 1 ? (usubp)&Assembler::usub8 : (usubp)&Assembler::usub16; + + assert_different_registers(str1, str2, cnt1, cnt2, tmp1, tmp2, result); + + Label Llength_diff, Ldone, Lshort_loop; + + BLOCK_COMMENT("string_compare {"); + + // for UU we count bytes (saves 1 insn) for others count in chars + if (cnt_per_char == 1 && bytes_per_char1 == 2) + __ lsr(cnt1, cnt1, 1); + if (cnt_per_char == 1 && bytes_per_char2 == 2) + __ lsr(cnt2, cnt2, 1); + + // Compute the minimum of the string lengths and save the difference. + __ subs(tmp1, cnt1, cnt2); + __ mov(cnt2, cnt1, Assembler::LE); // min + + // Check if the strings start at the same location. + __ cmp(str1, str2); + __ b(Llength_diff, Assembler::EQ); + + // without NEON only for UU and LL fast path is available + if ((VM_Version::features() & FT_AdvSIMD) || bytes_per_char1 == bytes_per_char2) { + Label Lshort_string, Lnext_word, Ldifference; + + // A very short string + __ cmp(cnt2, 8+4); + __ b(Lshort_string, Assembler::LT); + + // Compare words + { + const int bits_per_char = bytes_per_char1==1 && bytes_per_char2==1 ? 
8 : 16; + // Check first few chars to avoid excessive processing + if (bytes_per_char1 == 1 && bytes_per_char2 == 1) { + Label Lfull_speed; + __ ldr(tmp2, __ post(str1, wordSize)); + __ ldr(result, __ post(str2, wordSize)); + (_masm.*usub)(result, tmp2, result, Assembler::AL); + __ tst(result, result); + __ b(Lfull_speed, Assembler::EQ); + + __ rbit(cnt1, result); + __ clz(cnt1, cnt1); + __ bic(cnt1, cnt1, bits_per_char-1); + __ lsr(result, result, cnt1); + __ lsr(tmp2, tmp2, cnt1); + __ ubfx(result, result, 0, bits_per_char); + __ ubfx(tmp2, tmp2, 0, bits_per_char); + __ cmp(result, tmp2); + __ sub(result, result, 1< Matcher::_in_arg_limit, unaligned +// h ^ | in | 5 +// | | args | 4 Holes in incoming args owned by SELF +// | | | | 3 +// | | +--------+ +// V | | old out| Empty on Intel, window on Sparc +// | old |preserve| Must be even aligned. +// | SP-+--------+----> Matcher::_old_SP, 8 (or 16 in LP64)-byte aligned +// | | in | 3 area for Intel ret address +// Owned by |preserve| Empty on Sparc. +// SELF +--------+ +// | | pad2 | 2 pad to align old SP +// | +--------+ 1 +// | | locks | 0 +// | +--------+----> VMRegImpl::stack0, 8 (or 16 in LP64)-byte aligned +// | | pad1 | 11 pad to align new SP +// | +--------+ +// | | | 10 +// | | spills | 9 spills +// V | | 8 (pad0 slot for callee) +// -----------+--------+----> Matcher::_out_arg_limit, unaligned +// ^ | out | 7 +// | | args | 6 Holes in outgoing args owned by CALLEE +// Owned by +--------+ +// CALLEE | new out| 6 Empty on Intel, window on Sparc +// | new |preserve| Must be even-aligned. +// | SP-+--------+----> Matcher::_new_SP, even aligned +// | | | +// +// Note 1: Only region 8-11 is determined by the allocator. Region 0-5 is +// known from SELF's arguments and the Java calling convention. +// Region 6-7 is determined per call site. +// Note 2: If the calling convention leaves holes in the incoming argument +// area, those holes are owned by SELF. Holes in the outgoing area +// are owned by the CALLEE. Holes should not be nessecary in the +// incoming area, as the Java calling convention is completely under +// the control of the AD file. Doubles can be sorted and packed to +// avoid holes. Holes in the outgoing arguments may be nessecary for +// varargs C calling conventions. +// Note 3: Region 0-3 is even aligned, with pad2 as needed. Region 3-5 is +// even aligned with pad0 as needed. +// Region 6 is even aligned. Region 6-7 is NOT even aligned; +// region 6-11 is even aligned; it may be padded out more so that +// the region from SP to FP meets the minimum stack alignment. + +frame %{ + // What direction does stack grow in (assumed to be same for native & Java) + stack_direction(TOWARDS_LOW); + + // These two registers define part of the calling convention + // between compiled code and the interpreter. 
+ inline_cache_reg(R_Ricklass); // Inline Cache Register or Method* for I2C + interpreter_method_oop_reg(R_Rmethod); // Method Oop Register when calling interpreter + + // Optional: name the operand used by cisc-spilling to access [stack_pointer + offset] + cisc_spilling_operand_name(indOffset); + + // Number of stack slots consumed by a Monitor enter + sync_stack_slots(1 * VMRegImpl::slots_per_word); + + // Compiled code's Frame Pointer + frame_pointer(R_R13); + + // Stack alignment requirement + stack_alignment(StackAlignmentInBytes); + // LP64: Alignment size in bytes (128-bit -> 16 bytes) + // !LP64: Alignment size in bytes (64-bit -> 8 bytes) + + // Number of stack slots between incoming argument block and the start of + // a new frame. The PROLOG must add this many slots to the stack. The + // EPILOG must remove this many slots. + // FP + LR + in_preserve_stack_slots(2 * VMRegImpl::slots_per_word); + + // Number of outgoing stack slots killed above the out_preserve_stack_slots + // for calls to C. Supports the var-args backing area for register parms. + // ADLC doesn't support parsing expressions, so I folded the math by hand. + varargs_C_out_slots_killed( 0); + + // The after-PROLOG location of the return address. Location of + // return address specifies a type (REG or STACK) and a number + // representing the register number (i.e. - use a register name) or + // stack slot. + // Ret Addr is on stack in slot 0 if no locks or verification or alignment. + // Otherwise, it is above the locks and verification slot and alignment word + return_addr(STACK - 1*VMRegImpl::slots_per_word + + align_up((Compile::current()->in_preserve_stack_slots() + + Compile::current()->fixed_slots()), + stack_alignment_in_slots())); + + // Body of function which returns an OptoRegs array locating + // arguments either in registers or in stack slots for calling + // java + calling_convention %{ + (void) SharedRuntime::java_calling_convention(sig_bt, regs, length, is_outgoing); + + %} + + // Body of function which returns an OptoRegs array locating + // arguments either in registers or in stack slots for callin + // C. + c_calling_convention %{ + // This is obviously always outgoing + (void) SharedRuntime::c_calling_convention(sig_bt, regs, /*regs2=*/NULL, length); + %} + + // Location of compiled Java return values. + return_value %{ + return c2::return_value(ideal_reg); + %} + + // Location of C return values. + c_return_value %{ +#ifndef HARD_FLOAT_CC + return c2::c_return_value(ideal_reg); +#else + return c2::return_value(ideal_reg); +#endif + %} + +%} + +//----------ATTRIBUTES--------------------------------------------------------- +//----------Instruction Attributes--------------------------------------------- +ins_attrib ins_cost(DEFAULT_COST); // Required cost attribute +ins_attrib ins_size(32); // Required size attribute (in bits) +ins_attrib ins_short_branch(0); // Required flag: is this instruction a + // non-matching short branch variant of some + // long branch? + +//----------OPERANDS----------------------------------------------------------- +// Operand definitions must precede instruction definitions for correct parsing +// in the ADLC because operands constitute user defined types which are used in +// instruction definitions. 
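
Before the operand definitions begin, here is an illustrative sketch (editorial example, not part of the patch) of how these pieces fit together: a CONST_INTER operand is consumed by an instruction's match rule, and the encoding block reads it back through the $-syntax, while REG_INTER operands yield registers. The rule name is hypothetical and simplified from the real add rules in this file, and the MacroAssembler add(Register, Register, imm) call is assumed by analogy with the sub/mov calls used elsewhere in this port.

instruct addI_reg_aimm_sketch(iRegI dst, iRegI src1, aimmI src2) %{
  match(Set dst (AddI src1 src2));         // ideal-graph pattern this rule covers
  size(4);
  format %{ "ADD    $dst,$src1,$src2" %}   // shown in PrintOptoAssembly-style output
  ins_encode %{
    // $src2$$constant queries the CONST_INTER operand for its value;
    // $dst$$Register / $src1$$Register query the REG_INTER operands.
    __ add($dst$$Register, $src1$$Register, $src2$$constant);
  %}
  ins_pipe(ialu_reg_imm);                  // pipeline class defined further below
%}

The load rules later in this file follow the same shape, with a memory opclass operand (e.g. memoryB) in place of the immediate.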
+ +//----------Simple Operands---------------------------------------------------- +// Immediate Operands +// Integer Immediate: 32-bit +operand immI() %{ + match(ConI); + + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 8-bit unsigned - for VMOV +operand immU8() %{ + predicate(0 <= n->get_int() && (n->get_int() <= 255)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 16-bit +operand immI16() %{ + predicate((n->get_int() >> 16) == 0 && (VM_Version::features() & FT_ARMV6T2)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: offset for half and double word loads and stores +operand immIHD() %{ + predicate(is_memoryHD(n->get_int())); + match(ConI); + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: offset for fp loads and stores +operand immIFP() %{ + predicate(is_memoryfp(n->get_int()) && ((n->get_int() & 3) == 0)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Valid scale values for addressing modes and shifts +operand immU5() %{ + predicate(0 <= n->get_int() && (n->get_int() <= 31)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 6-bit +operand immU6Big() %{ + predicate(n->get_int() >= 32 && n->get_int() <= 63); + match(ConI); + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: 0-bit +operand immI0() %{ + predicate(n->get_int() == 0); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 1 +operand immI_1() %{ + predicate(n->get_int() == 1); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 2 +operand immI_2() %{ + predicate(n->get_int() == 2); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 3 +operand immI_3() %{ + predicate(n->get_int() == 3); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 4 +operand immI_4() %{ + predicate(n->get_int() == 4); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 8 +operand immI_8() %{ + predicate(n->get_int() == 8); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Int Immediate non-negative +operand immU31() +%{ + predicate(n->get_int() >= 0); + match(ConI); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the values 32-63 +operand immI_32_63() %{ + predicate(n->get_int() >= 32 && n->get_int() <= 63); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Immediates for special shifts (sign extend) + +// Integer Immediate: the value 16 +operand immI_16() %{ + predicate(n->get_int() == 16); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 24 +operand immI_24() %{ + predicate(n->get_int() == 24); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 255 +operand immI_255() %{ + predicate( n->get_int() == 255 ); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediate: the value 65535 +operand immI_65535() %{ + predicate(n->get_int() == 
65535); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediates for arithmetic instructions + +operand aimmI() %{ + predicate(is_aimm(n->get_int())); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand aimmIneg() %{ + predicate(is_aimm(-n->get_int())); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand aimmU31() %{ + predicate((0 <= n->get_int()) && is_aimm(n->get_int())); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Integer Immediates for logical instructions + +operand limmI() %{ + predicate(is_limmI(n->get_int())); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand limmIlow8() %{ + predicate(is_limmI_low(n->get_int(), 8)); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand limmU31() %{ + predicate(0 <= n->get_int() && is_limmI(n->get_int())); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand limmIn() %{ + predicate(is_limmI(~n->get_int())); + match(ConI); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Long Immediate: the value FF +operand immL_FF() %{ + predicate( n->get_long() == 0xFFL ); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Long Immediate: the value FFFF +operand immL_FFFF() %{ + predicate( n->get_long() == 0xFFFFL ); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Pointer Immediate: 32 or 64-bit +operand immP() %{ + match(ConP); + + op_cost(5); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +operand immP0() %{ + predicate(n->get_ptr() == 0); + match(ConP); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +operand immP_poll() %{ + predicate(n->get_ptr() != 0 && n->get_ptr() == (intptr_t)os::get_polling_page()); + match(ConP); + + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +// Pointer Immediate +operand immN() +%{ + match(ConN); + + op_cost(10); + format %{ %} + interface(CONST_INTER); +%} + +operand immNKlass() +%{ + match(ConNKlass); + + op_cost(10); + format %{ %} + interface(CONST_INTER); +%} + +// NULL Pointer Immediate +operand immN0() +%{ + predicate(n->get_narrowcon() == 0); + match(ConN); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand immL() %{ + match(ConL); + op_cost(40); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +operand immL0() %{ + predicate(n->get_long() == 0L); + match(ConL); + op_cost(0); + // formats are generated automatically for constants and base registers + format %{ %} + interface(CONST_INTER); +%} + +// Long Immediate: 16-bit +operand immL16() %{ + predicate(n->get_long() >= 0 && n->get_long() < (1<<16) && (VM_Version::features() & FT_ARMV6T2)); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Long Immediate: low 32-bit mask +operand immL_32bits() %{ + predicate(n->get_long() == 0xFFFFFFFFL); + match(ConL); + op_cost(0); + + format %{ %} + interface(CONST_INTER); +%} + +// Double Immediate +operand immD() %{ + match(ConD); + + op_cost(40); + format %{ %} + interface(CONST_INTER); +%} + +// Double Immediate: +0.0d. 
+operand immD0() %{ + predicate(jlong_cast(n->getd()) == 0); + + match(ConD); + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +operand imm8D() %{ + predicate(Assembler::operand_valid_for_double_immediate(n->getd())); + match(ConD); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// Float Immediate +operand immF() %{ + match(ConF); + + op_cost(20); + format %{ %} + interface(CONST_INTER); +%} + +// Float Immediate: +0.0f +operand immF0() %{ + predicate(jint_cast(n->getf()) == 0); + match(ConF); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// Float Immediate: encoded as 8 bits +operand imm8F() %{ + predicate(Assembler::operand_valid_for_float_immediate(n->getf())); + match(ConF); + + op_cost(0); + format %{ %} + interface(CONST_INTER); +%} + +// Integer Register Operands +// Integer Register +operand iRegI() %{ + constraint(ALLOC_IN_RC(int_reg)); + match(RegI); + match(R0RegI); + match(R1RegI); + match(R2RegI); + match(R3RegI); + match(R12RegI); + + format %{ %} + interface(REG_INTER); +%} + +// Pointer Register +operand iRegP() %{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(RegP); + match(R0RegP); + match(R1RegP); + match(R2RegP); + match(RExceptionRegP); + match(RmethodRegP); // R8 + match(R9RegP); + match(RthreadRegP); // R10, TODO Oracle FIXME: move to sp_ptr_RegP? + match(R12RegP); + match(LRRegP); + + match(sp_ptr_RegP); + match(store_ptr_RegP); + + format %{ %} + interface(REG_INTER); +%} + +// GPRs + Rmethod + Rthread + SP +operand sp_ptr_RegP() %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(RegP); + match(iRegP); + match(SPRegP); // FIXME: check cost + + format %{ %} + interface(REG_INTER); +%} + +operand R0RegP() %{ + constraint(ALLOC_IN_RC(R0_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand R1RegP() %{ + constraint(ALLOC_IN_RC(R1_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand R2RegP() %{ + constraint(ALLOC_IN_RC(R2_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand RExceptionRegP() %{ + constraint(ALLOC_IN_RC(Rexception_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand RthreadRegP() %{ + constraint(ALLOC_IN_RC(Rthread_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand RmethodRegP() %{ + constraint(ALLOC_IN_RC(Rmethod_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand IPRegP() %{ + constraint(ALLOC_IN_RC(IP_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand LRRegP() %{ + constraint(ALLOC_IN_RC(LR_regP)); + match(iRegP); + + format %{ %} + interface(REG_INTER); +%} + +operand R0RegI() %{ + constraint(ALLOC_IN_RC(R0_regI)); + match(iRegI); + + format %{ %} + interface(REG_INTER); +%} + +operand R1RegI() %{ + constraint(ALLOC_IN_RC(R1_regI)); + match(iRegI); + + format %{ %} + interface(REG_INTER); +%} + +operand R2RegI() %{ + constraint(ALLOC_IN_RC(R2_regI)); + match(iRegI); + + format %{ %} + interface(REG_INTER); +%} + +operand R3RegI() %{ + constraint(ALLOC_IN_RC(R3_regI)); + match(iRegI); + + format %{ %} + interface(REG_INTER); +%} + +operand R9RegI() %{ + constraint(ALLOC_IN_RC(R9_regI)); + match(iRegI); + + format %{ %} + interface(REG_INTER); +%} + +operand R12RegI() %{ + constraint(ALLOC_IN_RC(R12_regI)); + match(iRegI); + + format %{ %} + interface(REG_INTER); +%} + +// Long Register +operand iRegL() %{ + constraint(ALLOC_IN_RC(long_reg)); + match(RegL); + match(R0R1RegL); + match(R2R3RegL); + + format %{ %} + 
interface(REG_INTER); +%} + +operand iRegLd() %{ + constraint(ALLOC_IN_RC(long_reg_align)); + match(iRegL); // FIXME: allows unaligned R11/R12? + + format %{ %} + interface(REG_INTER); +%} + +// first long arg, or return value +operand R0R1RegL() %{ + constraint(ALLOC_IN_RC(R0R1_regL)); + match(iRegL); + + format %{ %} + interface(REG_INTER); +%} + +operand R2R3RegL() %{ + constraint(ALLOC_IN_RC(R2R3_regL)); + match(iRegL); + + format %{ %} + interface(REG_INTER); +%} + +// Condition Code Flag Register +operand flagsReg() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr" %} + interface(REG_INTER); +%} + +// Result of compare to 0 (TST) +operand flagsReg_EQNELTGE() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr_EQNELTGE" %} + interface(REG_INTER); +%} + +// Condition Code Register, unsigned comparisons. +operand flagsRegU() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); +#ifdef TODO + match(RegFlagsP); +#endif + + format %{ "apsr_U" %} + interface(REG_INTER); +%} + +// Condition Code Register, pointer comparisons. +operand flagsRegP() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr_P" %} + interface(REG_INTER); +%} + +// Condition Code Register, long comparisons. +operand flagsRegL_LTGE() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr_L_LTGE" %} + interface(REG_INTER); +%} + +operand flagsRegUL() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr_UL" %} + interface(REG_INTER); +%} + +operand flagsRegL_EQNE() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr_L_EQNE" %} + interface(REG_INTER); +%} + +operand flagsRegL_LEGT() %{ + constraint(ALLOC_IN_RC(int_flags)); + match(RegFlags); + + format %{ "apsr_L_LEGT" %} + interface(REG_INTER); +%} + +// Condition Code Register, floating comparisons, unordered same as "less". 
+operand flagsRegF() %{ + constraint(ALLOC_IN_RC(float_flags)); + match(RegFlags); + + format %{ "fpscr_F" %} + interface(REG_INTER); +%} + +// Vectors +operand vecD() %{ + constraint(ALLOC_IN_RC(actual_dflt_reg)); + match(VecD); + + format %{ %} + interface(REG_INTER); +%} + +operand vecX() %{ + constraint(ALLOC_IN_RC(vectorx_reg)); + match(VecX); + + format %{ %} + interface(REG_INTER); +%} + +operand regD() %{ + constraint(ALLOC_IN_RC(actual_dflt_reg)); + match(RegD); + match(regD_low); + + format %{ %} + interface(REG_INTER); +%} + +operand Q0_regD() %{ + constraint(ALLOC_IN_RC(D0D1_regD)); + match(RegD); + match(regD_low); + + format %{ %} + interface(REG_INTER); +%} + +operand Q1_regD() %{ + constraint(ALLOC_IN_RC(D2D3_regD)); + match(RegD); + match(regD_low); + + format %{ %} + interface(REG_INTER); +%} + +operand regF() %{ + constraint(ALLOC_IN_RC(sflt_reg)); + match(RegF); + + format %{ %} + interface(REG_INTER); +%} + +operand regD_low() %{ + constraint(ALLOC_IN_RC(dflt_low_reg)); + match(RegD); + + format %{ %} + interface(REG_INTER); +%} + +// Special Registers + +// Method Register +operand inline_cache_regP(iRegP reg) %{ + constraint(ALLOC_IN_RC(Ricklass_regP)); + match(reg); + format %{ %} + interface(REG_INTER); +%} + +operand interpreter_method_oop_regP(iRegP reg) %{ + constraint(ALLOC_IN_RC(Rmethod_regP)); + match(reg); + format %{ %} + interface(REG_INTER); +%} + + +//----------Complex Operands--------------------------------------------------- +// Indirect Memory Reference +operand indirect(sp_ptr_RegP reg) %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(reg); + + op_cost(100); + format %{ "[$reg]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xf); // PC => no index + scale(0x0); + disp(0x0); + %} +%} + +// Indirect with Offset in ]-4096, 4096[ +operand indOffset12(sp_ptr_RegP reg, immI12 offset) %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(AddP reg offset); + + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xf); // PC => no index + scale(0x0); + disp($offset); + %} +%} + +// Indirect with offset for float load/store +operand indOffsetFP(sp_ptr_RegP reg, immIFP offset) %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(AddP reg offset); + + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xf); // PC => no index + scale(0x0); + disp($offset); + %} +%} + +// Indirect with Offset for half and double words +operand indOffsetHD(sp_ptr_RegP reg, immIHD offset) %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(AddP reg offset); + + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xf); // PC => no index + scale(0x0); + disp($offset); + %} +%} + +// Indirect with Offset and Offset+4 in ]-1024, 1024[ +operand indOffsetFPx2(sp_ptr_RegP reg, immX10x2 offset) %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(AddP reg offset); + + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xf); // PC => no index + scale(0x0); + disp($offset); + %} +%} + +// Indirect with Offset and Offset+4 in ]-4096, 4096[ +operand indOffset12x2(sp_ptr_RegP reg, immI12x2 offset) %{ + constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(AddP reg offset); + + op_cost(100); + format %{ "[$reg + $offset]" %} + interface(MEMORY_INTER) %{ + base($reg); + index(0xf); // PC => no index + scale(0x0); + disp($offset); + %} +%} + +// Indirect with Register Index +operand indIndex(iRegP addr, iRegX index) %{ + 
constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP addr index); + + op_cost(100); + format %{ "[$addr + $index]" %} + interface(MEMORY_INTER) %{ + base($addr); + index($index); + scale(0x0); + disp(0x0); + %} +%} + +// Indirect Memory Times Scale Plus Index Register +operand indIndexScale(iRegP addr, iRegX index, immU5 scale) %{ + constraint(ALLOC_IN_RC(ptr_reg)); + match(AddP addr (LShiftX index scale)); + + op_cost(100); + format %{"[$addr + $index << $scale]" %} + interface(MEMORY_INTER) %{ + base($addr); + index($index); + scale($scale); + disp(0x0); + %} +%} + +// Operands for expressing Control Flow +// NOTE: Label is a predefined operand which should not be redefined in +// the AD file. It is generically handled within the ADLC. + +//----------Conditional Branch Operands---------------------------------------- +// Comparison Op - This is the operation of the comparison, and is limited to +// the following set of codes: +// L (<), LE (<=), G (>), GE (>=), E (==), NE (!=) +// +// Other attributes of the comparison, such as unsignedness, are specified +// by the comparison instruction that sets a condition code flags register. +// That result is represented by a flags operand whose subtype is appropriate +// to the unsignedness (etc.) of the comparison. +// +// Later, the instruction which matches both the Comparison Op (a Bool) and +// the flags (produced by the Cmp) specifies the coding of the comparison op +// by matching a specific subtype of Bool operand below, such as cmpOpU. + +operand cmpOp() %{ + match(Bool); + + format %{ "" %} + interface(COND_INTER) %{ + equal(0x0); + not_equal(0x1); + less(0xb); + greater_equal(0xa); + less_equal(0xd); + greater(0xc); + overflow(0x0); // unsupported/unimplemented + no_overflow(0x0); // unsupported/unimplemented + %} +%} + +// integer comparison with 0, signed +operand cmpOp0() %{ + match(Bool); + + format %{ "" %} + interface(COND_INTER) %{ + equal(0x0); + not_equal(0x1); + less(0x4); + greater_equal(0x5); + less_equal(0xd); // unsupported + greater(0xc); // unsupported + overflow(0x0); // unsupported/unimplemented + no_overflow(0x0); // unsupported/unimplemented + %} +%} + +// Comparison Op, unsigned +operand cmpOpU() %{ + match(Bool); + + format %{ "u" %} + interface(COND_INTER) %{ + equal(0x0); + not_equal(0x1); + less(0x3); + greater_equal(0x2); + less_equal(0x9); + greater(0x8); + overflow(0x0); // unsupported/unimplemented + no_overflow(0x0); // unsupported/unimplemented + %} +%} + +// Comparison Op, pointer (same as unsigned) +operand cmpOpP() %{ + match(Bool); + + format %{ "p" %} + interface(COND_INTER) %{ + equal(0x0); + not_equal(0x1); + less(0x3); + greater_equal(0x2); + less_equal(0x9); + greater(0x8); + overflow(0x0); // unsupported/unimplemented + no_overflow(0x0); // unsupported/unimplemented + %} +%} + +operand cmpOpL() %{ + match(Bool); + + format %{ "L" %} + interface(COND_INTER) %{ + equal(0x0); + not_equal(0x1); + less(0xb); + greater_equal(0xa); + less_equal(0xd); + greater(0xc); + overflow(0x0); // unsupported/unimplemented + no_overflow(0x0); // unsupported/unimplemented + %} +%} + +operand cmpOpL_commute() %{ + match(Bool); + + format %{ "L" %} + interface(COND_INTER) %{ + equal(0x0); + not_equal(0x1); + less(0xc); + greater_equal(0xd); + less_equal(0xa); + greater(0xb); + overflow(0x0); // unsupported/unimplemented + no_overflow(0x0); // unsupported/unimplemented + %} +%} + +//----------OPERAND CLASSES---------------------------------------------------- +// Operand Classes are groups of operands that are used to 
simplify +// instruction definitions by not requiring the AD writer to specify separate +// instructions for every form of operand when the instruction accepts +// multiple operand types with the same basic encoding and format. The classic +// case of this is memory operands. +opclass memoryI ( indirect, indOffset12, indIndex, indIndexScale ); +opclass memoryP ( indirect, indOffset12, indIndex, indIndexScale ); +opclass memoryF ( indirect, indOffsetFP ); +opclass memoryF2 ( indirect, indOffsetFPx2 ); +opclass memoryD ( indirect, indOffsetFP ); +opclass memoryfp( indirect, indOffsetFP ); +opclass memoryB ( indirect, indIndex, indOffsetHD ); +opclass memoryS ( indirect, indIndex, indOffsetHD ); +opclass memoryL ( indirect, indIndex, indOffsetHD ); + +opclass memoryScaledI(indIndexScale); +opclass memoryScaledP(indIndexScale); + +// when ldrex/strex is used: +opclass memoryex ( indirect ); +opclass indIndexMemory( indIndex ); +opclass memorylong ( indirect, indOffset12x2 ); +opclass memoryvld ( indirect /* , write back mode not implemented */ ); + +//----------PIPELINE----------------------------------------------------------- +pipeline %{ + +//----------ATTRIBUTES--------------------------------------------------------- +attributes %{ + fixed_size_instructions; // Fixed size instructions + max_instructions_per_bundle = 4; // Up to 4 instructions per bundle + instruction_unit_size = 4; // An instruction is 4 bytes long + instruction_fetch_unit_size = 16; // The processor fetches one line + instruction_fetch_units = 1; // of 16 bytes + + // List of nop instructions + nops( Nop_A0, Nop_A1, Nop_MS, Nop_FA, Nop_BR ); +%} + +//----------RESOURCES---------------------------------------------------------- +// Resources are the functional units available to the machine +resources(A0, A1, MS, BR, FA, FM, IDIV, FDIV, IALU = A0 | A1); + +//----------PIPELINE DESCRIPTION----------------------------------------------- +// Pipeline Description specifies the stages in the machine's pipeline + +pipe_desc(A, P, F, B, I, J, S, R, E, C, M, W, X, T, D); + +//----------PIPELINE CLASSES--------------------------------------------------- +// Pipeline Classes describe the stages in which input and output are +// referenced by the hardware pipeline. 
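
The pipe_class bodies that follow use a compact notation: an entry such as `dst : E(write)` says that the operand named dst is written at stage E of the pipe_desc declared above (optionally with a `+n` cycle offset), and an entry such as `IALU : R` reserves the IALU resource (declared above as A0 | A1) during stage R. As a reading aid, here is a small annotated sketch with a hypothetical name (illustrative only, not part of the patch):

pipe_class ialu_shift_imm_sketch(iRegI dst, iRegI src) %{
  single_instruction;   // bundling hint: this class covers one machine instruction
  dst  : E(write);      // result register becomes available in stage E
  src  : R(read);       // source register is read in stage R
  IALU : R;             // one of the integer ALUs (A0 or A1) is busy in stage R
%}

The concrete classes below follow this pattern, adding multiple_bundles, instruction_count(n), or fixed_latency(n) where a rule expands to several machine instructions.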
+ +// Integer ALU reg-reg operation +pipe_class ialu_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + single_instruction; + dst : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; +%} + +// Integer ALU reg-reg long operation +pipe_class ialu_reg_reg_2(iRegL dst, iRegL src1, iRegL src2) %{ + instruction_count(2); + dst : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; + IALU : R; +%} + +// Integer ALU reg-reg long dependent operation +pipe_class ialu_reg_reg_2_dep(iRegL dst, iRegL src1, iRegL src2, flagsReg cr) %{ + instruction_count(1); multiple_bundles; + dst : E(write); + src1 : R(read); + src2 : R(read); + cr : E(write); + IALU : R(2); +%} + +// Integer ALU reg-imm operaion +pipe_class ialu_reg_imm(iRegI dst, iRegI src1) %{ + single_instruction; + dst : E(write); + src1 : R(read); + IALU : R; +%} + +// Integer ALU reg-reg operation with condition code +pipe_class ialu_cc_reg_reg(iRegI dst, iRegI src1, iRegI src2, flagsReg cr) %{ + single_instruction; + dst : E(write); + cr : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; +%} + +// Integer ALU zero-reg operation +pipe_class ialu_zero_reg(iRegI dst, immI0 zero, iRegI src2) %{ + single_instruction; + dst : E(write); + src2 : R(read); + IALU : R; +%} + +// Integer ALU zero-reg operation with condition code only +pipe_class ialu_cconly_zero_reg(flagsReg cr, iRegI src) %{ + single_instruction; + cr : E(write); + src : R(read); + IALU : R; +%} + +// Integer ALU reg-reg operation with condition code only +pipe_class ialu_cconly_reg_reg(flagsReg cr, iRegI src1, iRegI src2) %{ + single_instruction; + cr : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; +%} + +// Integer ALU reg-imm operation with condition code only +pipe_class ialu_cconly_reg_imm(flagsReg cr, iRegI src1) %{ + single_instruction; + cr : E(write); + src1 : R(read); + IALU : R; +%} + +// Integer ALU reg-reg-zero operation with condition code only +pipe_class ialu_cconly_reg_reg_zero(flagsReg cr, iRegI src1, iRegI src2, immI0 zero) %{ + single_instruction; + cr : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; +%} + +// Integer ALU reg-imm-zero operation with condition code only +pipe_class ialu_cconly_reg_imm_zero(flagsReg cr, iRegI src1, immI0 zero) %{ + single_instruction; + cr : E(write); + src1 : R(read); + IALU : R; +%} + +// Integer ALU reg-reg operation with condition code, src1 modified +pipe_class ialu_cc_rwreg_reg(flagsReg cr, iRegI src1, iRegI src2) %{ + single_instruction; + cr : E(write); + src1 : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; +%} + +pipe_class cmpL_reg(iRegI dst, iRegL src1, iRegL src2, flagsReg cr ) %{ + multiple_bundles; + dst : E(write)+4; + cr : E(write); + src1 : R(read); + src2 : R(read); + IALU : R(3); + BR : R(2); +%} + +// Integer ALU operation +pipe_class ialu_none(iRegI dst) %{ + single_instruction; + dst : E(write); + IALU : R; +%} + +// Integer ALU reg operation +pipe_class ialu_reg(iRegI dst, iRegI src) %{ + single_instruction; may_have_no_code; + dst : E(write); + src : R(read); + IALU : R; +%} + +// Integer ALU reg conditional operation +// This instruction has a 1 cycle stall, and cannot execute +// in the same cycle as the instruction setting the condition +// code. We kludge this by pretending to read the condition code +// 1 cycle earlier, and by marking the functional units as busy +// for 2 cycles with the result available 1 cycle later than +// is really the case. 
+pipe_class ialu_reg_flags( iRegI op2_out, iRegI op2_in, iRegI op1, flagsReg cr ) %{ + single_instruction; + op2_out : C(write); + op1 : R(read); + cr : R(read); // This is really E, with a 1 cycle stall + BR : R(2); + MS : R(2); +%} + +// Integer ALU reg operation +pipe_class ialu_move_reg_L_to_I(iRegI dst, iRegL src) %{ + single_instruction; may_have_no_code; + dst : E(write); + src : R(read); + IALU : R; +%} +pipe_class ialu_move_reg_I_to_L(iRegL dst, iRegI src) %{ + single_instruction; may_have_no_code; + dst : E(write); + src : R(read); + IALU : R; +%} + +// Two integer ALU reg operations +pipe_class ialu_reg_2(iRegL dst, iRegL src) %{ + instruction_count(2); + dst : E(write); + src : R(read); + A0 : R; + A1 : R; +%} + +// Two integer ALU reg operations +pipe_class ialu_move_reg_L_to_L(iRegL dst, iRegL src) %{ + instruction_count(2); may_have_no_code; + dst : E(write); + src : R(read); + A0 : R; + A1 : R; +%} + +// Integer ALU imm operation +pipe_class ialu_imm(iRegI dst) %{ + single_instruction; + dst : E(write); + IALU : R; +%} + +pipe_class ialu_imm_n(iRegI dst) %{ + single_instruction; + dst : E(write); + IALU : R; +%} + +// Integer ALU reg-reg with carry operation +pipe_class ialu_reg_reg_cy(iRegI dst, iRegI src1, iRegI src2, iRegI cy) %{ + single_instruction; + dst : E(write); + src1 : R(read); + src2 : R(read); + IALU : R; +%} + +// Integer ALU cc operation +pipe_class ialu_cc(iRegI dst, flagsReg cc) %{ + single_instruction; + dst : E(write); + cc : R(read); + IALU : R; +%} + +// Integer ALU cc / second IALU operation +pipe_class ialu_reg_ialu( iRegI dst, iRegI src ) %{ + instruction_count(1); multiple_bundles; + dst : E(write)+1; + src : R(read); + IALU : R; +%} + +// Integer ALU cc / second IALU operation +pipe_class ialu_reg_reg_ialu( iRegI dst, iRegI p, iRegI q ) %{ + instruction_count(1); multiple_bundles; + dst : E(write)+1; + p : R(read); + q : R(read); + IALU : R; +%} + +// Integer ALU hi-lo-reg operation +pipe_class ialu_hi_lo_reg(iRegI dst, immI src) %{ + instruction_count(1); multiple_bundles; + dst : E(write)+1; + IALU : R(2); +%} + +// Long Constant +pipe_class loadConL( iRegL dst, immL src ) %{ + instruction_count(2); multiple_bundles; + dst : E(write)+1; + IALU : R(2); + IALU : R(2); +%} + +// Pointer Constant +pipe_class loadConP( iRegP dst, immP src ) %{ + instruction_count(0); multiple_bundles; + fixed_latency(6); +%} + +// Polling Address +pipe_class loadConP_poll( iRegP dst, immP_poll src ) %{ + dst : E(write); + IALU : R; +%} + +// Long Constant small +pipe_class loadConLlo( iRegL dst, immL src ) %{ + instruction_count(2); + dst : E(write); + IALU : R; + IALU : R; +%} + +// [PHH] This is wrong for 64-bit. See LdImmF/D. 
+pipe_class loadConFD(regF dst, immF src, iRegP tmp) %{ + instruction_count(1); multiple_bundles; + src : R(read); + dst : M(write)+1; + IALU : R; + MS : E; +%} + +// Integer ALU nop operation +pipe_class ialu_nop() %{ + single_instruction; + IALU : R; +%} + +// Integer ALU nop operation +pipe_class ialu_nop_A0() %{ + single_instruction; + A0 : R; +%} + +// Integer ALU nop operation +pipe_class ialu_nop_A1() %{ + single_instruction; + A1 : R; +%} + +// Integer Multiply reg-reg operation +pipe_class imul_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + single_instruction; + dst : E(write); + src1 : R(read); + src2 : R(read); + MS : R(5); +%} + +pipe_class mulL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ + single_instruction; + dst : E(write)+4; + src1 : R(read); + src2 : R(read); + MS : R(6); +%} + +// Integer Divide reg-reg +pipe_class sdiv_reg_reg_IDIV(iRegI dst, iRegI src1, iRegI src2, iRegI temp, flagsReg cr) %{ + single_instruction; + dst : E(write); + temp : E(write); + src1 : R(read); + src2 : R(read); + temp : R(read); + MS : R(10); +%} + +pipe_class sdiv_reg_reg_SW(iRegI dst, iRegI src1, iRegI src2, iRegI temp1, iRegI temp2, flagsReg cr) %{ + instruction_count(1); multiple_bundles; + dst : E(write); + temp1 : E(write); + temp2 : E(write); + src1 : R(read); + src2 : R(read); + temp1 : R(read); + temp2 : R(read); + MS : R(38); +%} + +// Long Divide +pipe_class divL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ + dst : E(write)+71; + src1 : R(read); + src2 : R(read)+1; + MS : R(70); +%} + +// Floating Point Add Float +pipe_class faddF_reg_reg(regF dst, regF src1, regF src2) %{ + single_instruction; + dst : X(write); + src1 : E(read); + src2 : E(read); + FA : R; +%} + +// Floating Point Add Double +pipe_class faddD_reg_reg(regD dst, regD src1, regD src2) %{ + single_instruction; + dst : X(write); + src1 : E(read); + src2 : E(read); + FA : R; +%} + +// Floating Point Conditional Move based on integer flags +pipe_class int_conditional_float_move (cmpOp cmp, flagsReg cr, regF dst, regF src) %{ + single_instruction; + dst : X(write); + src : E(read); + cr : R(read); + FA : R(2); + BR : R(2); +%} + +// Floating Point Conditional Move based on integer flags +pipe_class int_conditional_double_move (cmpOp cmp, flagsReg cr, regD dst, regD src) %{ + single_instruction; + dst : X(write); + src : E(read); + cr : R(read); + FA : R(2); + BR : R(2); +%} + +// Floating Point Multiply Float +pipe_class fmulF_reg_reg(regF dst, regF src1, regF src2) %{ + single_instruction; + dst : X(write); + src1 : E(read); + src2 : E(read); + FM : R; +%} + +// Floating Point Multiply Double +pipe_class fmulD_reg_reg(regD dst, regD src1, regD src2) %{ + single_instruction; + dst : X(write); + src1 : E(read); + src2 : E(read); + FM : R; +%} + +// Floating Point Divide Float +pipe_class fdivF_reg_reg(regF dst, regF src1, regF src2) %{ + single_instruction; + dst : X(write); + src1 : E(read); + src2 : E(read); + FM : R; + FDIV : C(14); +%} + +// Floating Point Divide Double +pipe_class fdivD_reg_reg(regD dst, regD src1, regD src2) %{ + single_instruction; + dst : X(write); + src1 : E(read); + src2 : E(read); + FM : R; + FDIV : C(17); +%} + +// Floating Point Move/Negate/Abs Float +pipe_class faddF_reg(regF dst, regF src) %{ + single_instruction; + dst : W(write); + src : E(read); + FA : R(1); +%} + +// Floating Point Move/Negate/Abs Double +pipe_class faddD_reg(regD dst, regD src) %{ + single_instruction; + dst : W(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert F->D +pipe_class fcvtF2D(regD dst, regF 
src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert I->D +pipe_class fcvtI2D(regD dst, regF src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert LHi->D +pipe_class fcvtLHi2D(regD dst, regD src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert L->D +pipe_class fcvtL2D(regD dst, iRegL src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert L->F +pipe_class fcvtL2F(regF dst, iRegL src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert D->F +pipe_class fcvtD2F(regD dst, regF src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert I->L +pipe_class fcvtI2L(regD dst, regF src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Convert D->F +pipe_class fcvtD2I(iRegI dst, regD src, flagsReg cr) %{ + instruction_count(1); multiple_bundles; + dst : X(write)+6; + src : E(read); + FA : R; +%} + +// Floating Point Convert D->L +pipe_class fcvtD2L(regD dst, regD src, flagsReg cr) %{ + instruction_count(1); multiple_bundles; + dst : X(write)+6; + src : E(read); + FA : R; +%} + +// Floating Point Convert F->I +pipe_class fcvtF2I(regF dst, regF src, flagsReg cr) %{ + instruction_count(1); multiple_bundles; + dst : X(write)+6; + src : E(read); + FA : R; +%} + +// Floating Point Convert F->L +pipe_class fcvtF2L(regD dst, regF src, flagsReg cr) %{ + instruction_count(1); multiple_bundles; + dst : X(write)+6; + src : E(read); + FA : R; +%} + +// Floating Point Convert I->F +pipe_class fcvtI2F(regF dst, regF src) %{ + single_instruction; + dst : X(write); + src : E(read); + FA : R; +%} + +// Floating Point Compare +pipe_class faddF_fcc_reg_reg_zero(flagsRegF cr, regF src1, regF src2, immI0 zero) %{ + single_instruction; + cr : X(write); + src1 : E(read); + src2 : E(read); + FA : R; +%} + +// Floating Point Compare +pipe_class faddD_fcc_reg_reg_zero(flagsRegF cr, regD src1, regD src2, immI0 zero) %{ + single_instruction; + cr : X(write); + src1 : E(read); + src2 : E(read); + FA : R; +%} + +// Floating Add Nop +pipe_class fadd_nop() %{ + single_instruction; + FA : R; +%} + +// Integer Store to Memory +pipe_class istore_mem_reg(memoryI mem, iRegI src) %{ + single_instruction; + mem : R(read); + src : C(read); + MS : R; +%} + +// Integer Store to Memory +pipe_class istore_mem_spORreg(memoryI mem, sp_ptr_RegP src) %{ + single_instruction; + mem : R(read); + src : C(read); + MS : R; +%} + +// Float Store +pipe_class fstoreF_mem_reg(memoryF mem, RegF src) %{ + single_instruction; + mem : R(read); + src : C(read); + MS : R; +%} + +// Float Store +pipe_class fstoreF_mem_zero(memoryF mem, immF0 src) %{ + single_instruction; + mem : R(read); + MS : R; +%} + +// Double Store +pipe_class fstoreD_mem_reg(memoryD mem, RegD src) %{ + instruction_count(1); + mem : R(read); + src : C(read); + MS : R; +%} + +// Double Store +pipe_class fstoreD_mem_zero(memoryD mem, immD0 src) %{ + single_instruction; + mem : R(read); + MS : R; +%} + +// Integer Load (when sign bit propagation not needed) +pipe_class iload_mem(iRegI dst, memoryI mem) %{ + single_instruction; + mem : R(read); + dst : C(write); + MS : R; +%} + +// Integer Load (when sign bit propagation or masking is needed) +pipe_class iload_mask_mem(iRegI dst, memoryI mem) %{ + single_instruction; + mem : R(read); + dst : 
M(write); + MS : R; +%} + +// Float Load +pipe_class floadF_mem(regF dst, memoryF mem) %{ + single_instruction; + mem : R(read); + dst : M(write); + MS : R; +%} + +// Float Load +pipe_class floadD_mem(regD dst, memoryD mem) %{ + instruction_count(1); multiple_bundles; // Again, unaligned argument is only multiple case + mem : R(read); + dst : M(write); + MS : R; +%} + +// Memory Nop +pipe_class mem_nop() %{ + single_instruction; + MS : R; +%} + +pipe_class sethi(iRegP dst, immI src) %{ + single_instruction; + dst : E(write); + IALU : R; +%} + +pipe_class loadPollP(iRegP poll) %{ + single_instruction; + poll : R(read); + MS : R; +%} + +pipe_class br(Universe br, label labl) %{ + single_instruction_with_delay_slot; + BR : R; +%} + +pipe_class br_cc(Universe br, cmpOp cmp, flagsReg cr, label labl) %{ + single_instruction_with_delay_slot; + cr : E(read); + BR : R; +%} + +pipe_class br_reg(Universe br, cmpOp cmp, iRegI op1, label labl) %{ + single_instruction_with_delay_slot; + op1 : E(read); + BR : R; + MS : R; +%} + +pipe_class br_nop() %{ + single_instruction; + BR : R; +%} + +pipe_class simple_call(method meth) %{ + instruction_count(2); multiple_bundles; force_serialization; + fixed_latency(100); + BR : R(1); + MS : R(1); + A0 : R(1); +%} + +pipe_class compiled_call(method meth) %{ + instruction_count(1); multiple_bundles; force_serialization; + fixed_latency(100); + MS : R(1); +%} + +pipe_class call(method meth) %{ + instruction_count(0); multiple_bundles; force_serialization; + fixed_latency(100); +%} + +pipe_class tail_call(Universe ignore, label labl) %{ + single_instruction; has_delay_slot; + fixed_latency(100); + BR : R(1); + MS : R(1); +%} + +pipe_class ret(Universe ignore) %{ + single_instruction; has_delay_slot; + BR : R(1); + MS : R(1); +%} + +// The real do-nothing guy +pipe_class empty( ) %{ + instruction_count(0); +%} + +pipe_class long_memory_op() %{ + instruction_count(0); multiple_bundles; force_serialization; + fixed_latency(25); + MS : R(1); +%} + +// Check-cast +pipe_class partial_subtype_check_pipe(Universe ignore, iRegP array, iRegP match ) %{ + array : R(read); + match : R(read); + IALU : R(2); + BR : R(2); + MS : R; +%} + +// Convert FPU flags into +1,0,-1 +pipe_class floating_cmp( iRegI dst, regF src1, regF src2 ) %{ + src1 : E(read); + src2 : E(read); + dst : E(write); + FA : R; + MS : R(2); + BR : R(2); +%} + +// Compare for p < q, and conditionally add y +pipe_class cadd_cmpltmask( iRegI p, iRegI q, iRegI y ) %{ + p : E(read); + q : E(read); + y : E(read); + IALU : R(3) +%} + +// Perform a compare, then move conditionally in a branch delay slot. 
+pipe_class min_max( iRegI src2, iRegI srcdst ) %{ + src2 : E(read); + srcdst : E(read); + IALU : R; + BR : R; +%} + +// Define the class for the Nop node +define %{ + MachNop = ialu_nop; +%} + +%} + +//----------INSTRUCTIONS------------------------------------------------------- + +//------------Special Nop instructions for bundling - no match rules----------- +// Nop using the A0 functional unit +instruct Nop_A0() %{ + ins_pipe(ialu_nop_A0); +%} + +// Nop using the A1 functional unit +instruct Nop_A1( ) %{ + ins_pipe(ialu_nop_A1); +%} + +// Nop using the memory functional unit +instruct Nop_MS( ) %{ + ins_pipe(mem_nop); +%} + +// Nop using the floating add functional unit +instruct Nop_FA( ) %{ + ins_pipe(fadd_nop); +%} + +// Nop using the branch functional unit +instruct Nop_BR( ) %{ + ins_pipe(br_nop); +%} + +//----------Load/Store/Move Instructions--------------------------------------- +//----------Load Instructions-------------------------------------------------- +// Load Byte (8bit signed) +instruct loadB(iRegI dst, memoryB mem) %{ + match(Set dst (LoadB mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRSB $dst,$mem\t! byte -> int" %} + ins_encode %{ + __ ldrsb($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Byte (8bit signed) into a Long Register +instruct loadB2L(iRegL dst, memoryB mem) %{ + match(Set dst (ConvI2L (LoadB mem))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRSB $dst.lo,$mem\t! byte -> long\n\t" + "ASR $dst.hi,$dst.lo,31" %} + ins_encode %{ + __ ldrsb($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), $dst$$Register, asr(31)); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Unsigned Byte (8bit UNsigned) into an int reg +instruct loadUB(iRegI dst, memoryB mem) %{ + match(Set dst (LoadUB mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRB $dst,$mem\t! ubyte -> int" %} + ins_encode %{ + __ ldrb($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +// Load Unsigned Byte (8bit UNsigned) into a Long Register +instruct loadUB2L(iRegL dst, memoryB mem) %{ + match(Set dst (ConvI2L (LoadUB mem))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRB $dst.lo,$mem\t! ubyte -> long\n\t" + "MOV $dst.hi,0" %} + ins_encode %{ + __ ldrb($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(iload_mem); +%} + +// Load Unsigned Byte (8 bit UNsigned) with immediate mask into Long Register +instruct loadUB2L_limmI(iRegL dst, memoryB mem, limmIlow8 mask) %{ + match(Set dst (ConvI2L (AndI (LoadUB mem) mask))); + + ins_cost(MEMORY_REF_COST + 2*DEFAULT_COST); + size(12); + format %{ "LDRB $dst.lo,$mem\t! ubyte -> long\n\t" + "MOV $dst.hi,0\n\t" + "AND $dst.lo,$dst.lo,$mask" %} + ins_encode %{ + __ ldrb($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + __ andr($dst$$Register, $dst$$Register, limmI_low($mask$$constant, 8)); + %} + ins_pipe(iload_mem); +%} + +// Load Short (16bit signed) + +instruct loadS(iRegI dst, memoryS mem) %{ + match(Set dst (LoadS mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRSH $dst,$mem\t! short" %} + ins_encode %{ + __ ldrsh($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Short (16 bit signed) to Byte (8 bit signed) +instruct loadS2B(iRegI dst, memoryS mem, immI_24 twentyfour) %{ + match(Set dst (RShiftI (LShiftI (LoadS mem) twentyfour) twentyfour)); + ins_cost(MEMORY_REF_COST); + + size(4); + + format %{ "LDRSB $dst,$mem\t! 
short -> byte" %} + ins_encode %{ + // High 32 bits are harmlessly set on Aarch64 + __ ldrsb($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Short (16bit signed) into a Long Register +instruct loadS2L(iRegL dst, memoryS mem) %{ + match(Set dst (ConvI2L (LoadS mem))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRSH $dst.lo,$mem\t! short -> long\n\t" + "ASR $dst.hi,$dst.lo,31" %} + ins_encode %{ + __ ldrsh($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), $dst$$Register, asr(31)); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Unsigned Short/Char (16bit UNsigned) + + +instruct loadUS(iRegI dst, memoryS mem) %{ + match(Set dst (LoadUS mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRH $dst,$mem\t! ushort/char" %} + ins_encode %{ + __ ldrh($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +// Load Unsigned Short/Char (16 bit UNsigned) to Byte (8 bit signed) +instruct loadUS2B(iRegI dst, memoryB mem, immI_24 twentyfour) %{ + match(Set dst (RShiftI (LShiftI (LoadUS mem) twentyfour) twentyfour)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRSB $dst,$mem\t! ushort -> byte" %} + ins_encode %{ + __ ldrsb($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Unsigned Short/Char (16bit UNsigned) into a Long Register +instruct loadUS2L(iRegL dst, memoryS mem) %{ + match(Set dst (ConvI2L (LoadUS mem))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRH $dst.lo,$mem\t! short -> long\n\t" + "MOV $dst.hi, 0" %} + ins_encode %{ + __ ldrh($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(iload_mem); +%} + +// Load Unsigned Short/Char (16bit UNsigned) with mask 0xFF into a Long Register +instruct loadUS2L_immI_255(iRegL dst, memoryB mem, immI_255 mask) %{ + match(Set dst (ConvI2L (AndI (LoadUS mem) mask))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRB $dst.lo,$mem\t! \n\t" + "MOV $dst.hi, 0" %} + ins_encode %{ + __ ldrb($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(iload_mem); +%} + +// Load Unsigned Short/Char (16bit UNsigned) with a immediate mask into a Long Register +instruct loadUS2L_limmI(iRegL dst, memoryS mem, limmI mask) %{ + match(Set dst (ConvI2L (AndI (LoadUS mem) mask))); + ins_cost(MEMORY_REF_COST + 2*DEFAULT_COST); + + size(12); + format %{ "LDRH $dst,$mem\t! ushort/char & mask -> long\n\t" + "MOV $dst.hi, 0\n\t" + "AND $dst,$dst,$mask" %} + ins_encode %{ + __ ldrh($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + __ andr($dst$$Register, $dst$$Register, $mask$$constant); + %} + ins_pipe(iload_mem); +%} + +// Load Integer + +instruct loadI(iRegI dst, memoryI mem) %{ + match(Set dst (LoadI mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "ldr $dst,$mem\t! int" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +// Load Integer to Byte (8 bit signed) +instruct loadI2B(iRegI dst, memoryS mem, immI_24 twentyfour) %{ + match(Set dst (RShiftI (LShiftI (LoadI mem) twentyfour) twentyfour)); + ins_cost(MEMORY_REF_COST); + + size(4); + + format %{ "LDRSB $dst,$mem\t! 
int -> byte" %} + ins_encode %{ + __ ldrsb($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Integer to Unsigned Byte (8 bit UNsigned) +instruct loadI2UB(iRegI dst, memoryB mem, immI_255 mask) %{ + match(Set dst (AndI (LoadI mem) mask)); + ins_cost(MEMORY_REF_COST); + + size(4); + + format %{ "LDRB $dst,$mem\t! int -> ubyte" %} + ins_encode %{ + __ ldrb($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Integer to Short (16 bit signed) +instruct loadI2S(iRegI dst, memoryS mem, immI_16 sixteen) %{ + match(Set dst (RShiftI (LShiftI (LoadI mem) sixteen) sixteen)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRSH $dst,$mem\t! int -> short" %} + ins_encode %{ + __ ldrsh($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Integer to Unsigned Short (16 bit UNsigned) +instruct loadI2US(iRegI dst, memoryS mem, immI_65535 mask) %{ + match(Set dst (AndI (LoadI mem) mask)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDRH $dst,$mem\t! int -> ushort/char" %} + ins_encode %{ + __ ldrh($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Integer into a Long Register +instruct loadI2L(iRegL dst, memoryI mem) %{ + match(Set dst (ConvI2L (LoadI mem))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDR $dst.lo,$mem\t! int -> long\n\t" + "ASR $dst.hi,$dst.lo,31\t! int->long" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), $dst$$Register, asr(31)); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Integer with mask 0xFF into a Long Register +instruct loadI2L_immI_255(iRegL dst, memoryB mem, immI_255 mask) %{ + match(Set dst (ConvI2L (AndI (LoadI mem) mask))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRB $dst.lo,$mem\t! int & 0xFF -> long\n\t" + "MOV $dst.hi, 0" %} + ins_encode %{ + __ ldrb($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(iload_mem); +%} + +// Load Integer with mask 0xFFFF into a Long Register +instruct loadI2L_immI_65535(iRegL dst, memoryS mem, immI_65535 mask) %{ + match(Set dst (ConvI2L (AndI (LoadI mem) mask))); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDRH $dst,$mem\t! int & 0xFFFF -> long\n\t" + "MOV $dst.hi, 0" %} + ins_encode %{ + __ ldrh($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(iload_mask_mem); +%} + +// Load Integer with a 31-bit immediate mask into a Long Register +instruct loadI2L_limmU31(iRegL dst, memoryI mem, limmU31 mask) %{ + match(Set dst (ConvI2L (AndI (LoadI mem) mask))); + ins_cost(MEMORY_REF_COST + 2*DEFAULT_COST); + + size(12); + format %{ "LDR $dst.lo,$mem\t! int -> long\n\t" + "MOV $dst.hi, 0\n\t" + "AND $dst,$dst,$mask" %} + + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + __ andr($dst$$Register, $dst$$Register, $mask$$constant); + %} + ins_pipe(iload_mem); +%} + +// Load Integer with a 31-bit mask into a Long Register +// FIXME: use iRegI mask, remove tmp? +instruct loadI2L_immU31(iRegL dst, memoryI mem, immU31 mask, iRegI tmp) %{ + match(Set dst (ConvI2L (AndI (LoadI mem) mask))); + effect(TEMP dst, TEMP tmp); + + ins_cost(MEMORY_REF_COST + 4*DEFAULT_COST); + size(20); + format %{ "LDR $mem,$dst\t! 
int & 31-bit mask -> long\n\t" + "MOV $dst.hi, 0\n\t" + "MOV_SLOW $tmp,$mask\n\t" + "AND $dst,$tmp,$dst" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + __ mov($tmp$$Register, $mask$$constant); + __ andr($dst$$Register, $dst$$Register, $tmp$$Register); + %} + ins_pipe(iload_mem); +%} + +// Load Unsigned Integer into a Long Register +instruct loadUI2L(iRegL dst, memoryI mem, immL_32bits mask) %{ + match(Set dst (AndL (ConvI2L (LoadI mem)) mask)); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDR $dst.lo,$mem\t! uint -> long\n\t" + "MOV $dst.hi,0" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(iload_mem); +%} + +// Load Long + +instruct loadL(iRegLd dst, memoryL mem ) %{ + predicate(!((LoadLNode*)n)->require_atomic_access()); + match(Set dst (LoadL mem)); + effect(TEMP dst); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "ldrd $dst,$mem\t! long" %} + ins_encode %{ + __ ldrd($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +instruct loadL_2instr(iRegL dst, memorylong mem ) %{ + predicate(!((LoadLNode*)n)->require_atomic_access()); + match(Set dst (LoadL mem)); + ins_cost(MEMORY_REF_COST + DEFAULT_COST); + + size(8); + format %{ "LDR $dst.lo,$mem \t! long order of instrs reversed if $dst.lo == base($mem)\n\t" + "LDR $dst.hi,$mem+4 or $mem" %} + ins_encode %{ + Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); + Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); + + if ($dst$$Register == reg_to_register_object($mem$$base)) { + __ ldr($dst$$Register->successor(), Amemhi); + __ ldr($dst$$Register, Amemlo); + } else { + __ ldr($dst$$Register, Amemlo); + __ ldr($dst$$Register->successor(), Amemhi); + } + %} + ins_pipe(iload_mem); +%} + +instruct loadL_volatile(iRegL dst, indirect mem ) %{ + predicate(((LoadLNode*)n)->require_atomic_access()); + match(Set dst (LoadL mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDREXD $dst,$mem\t! long" %} + ins_encode %{ + __ atomic_ldrd($dst$$Register, reg_to_register_object($dst$$reg + 1), reg_to_register_object($mem$$base)); + %} + ins_pipe(iload_mem); +%} + +instruct loadL_volatile_fp(iRegL dst, memoryD mem ) %{ + predicate(((LoadLNode*)n)->require_atomic_access()); + match(Set dst (LoadL mem)); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "FLDD S14, $mem" + "FMRRD $dst, S14\t! long \n't" %} + ins_encode %{ + __ vldr_f64(f14, $mem$$Address); + __ vmov_f64($dst$$Register, $dst$$Register->successor(), f14); + %} + ins_pipe(iload_mem); +%} + +instruct loadL_unaligned(iRegL dst, memorylong mem ) %{ + match(Set dst (LoadL_unaligned mem)); + ins_cost(MEMORY_REF_COST); + + size(8); + format %{ "LDR $dst.lo,$mem\t! 
long order of instrs reversed if $dst.lo == base($mem)\n\t" + "LDR $dst.hi,$mem+4" %} + ins_encode %{ + Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); + Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); + + if ($dst$$Register == reg_to_register_object($mem$$base)) { + __ ldr($dst$$Register->successor(), Amemhi); + __ ldr($dst$$Register, Amemlo); + } else { + __ ldr($dst$$Register, Amemlo); + __ ldr($dst$$Register->successor(), Amemhi); + } + %} + ins_pipe(iload_mem); +%} + +// Load Range +instruct loadRange(iRegI dst, memoryI mem) %{ + match(Set dst (LoadRange mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "LDR_u32 $dst,$mem\t! range" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +// Load Pointer + +instruct loadP(iRegP dst, memoryP mem) %{ + match(Set dst (LoadP mem)); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "LDR $dst,$mem\t! ptr" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +// Load Klass Pointer +instruct loadKlass(iRegP dst, memoryI mem) %{ + match(Set dst (LoadKlass mem)); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "LDR $dst,$mem\t! klass ptr" %} + ins_encode %{ + __ ldr($dst$$Register, $mem$$Address); + %} + ins_pipe(iload_mem); +%} + +instruct loadD(regD dst, memoryD mem) %{ + match(Set dst (LoadD mem)); + ins_cost(MEMORY_REF_COST); + + size(4); + // FIXME: needs to be atomic, but ARMv7 A.R.M. guarantees + // only LDREXD and STREXD are 64-bit single-copy atomic + format %{ "FLDD $dst,$mem" %} + ins_encode %{ + __ vldr_f64($dst$$FloatRegister, $mem$$Address); + %} + ins_pipe(floadD_mem); +%} + +// Load Double - UNaligned +instruct loadD_unaligned(regD_low dst, memoryF2 mem ) %{ + match(Set dst (LoadD_unaligned mem)); + ins_cost(MEMORY_REF_COST*2+DEFAULT_COST); + size(8); + format %{ "FLDS $dst.lo,$mem\t! misaligned double\n" + "\tFLDS $dst.hi,$mem+4\t!" 
%} + ins_encode %{ + Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); + Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); + __ vldr_f32($dst$$FloatRegister, Amemlo); + __ vldr_f32($dst$$FloatRegister->successor(FloatRegisterImpl::SINGLE), Amemhi); + %} + ins_pipe(iload_mem); +%} + +instruct loadF(regF dst, memoryF mem) %{ + match(Set dst (LoadF mem)); + + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "FLDS $dst,$mem" %} + ins_encode %{ + __ vldr_f32($dst$$FloatRegister, $mem$$Address); + %} + ins_pipe(floadF_mem); +%} + +// // Load Constant +instruct loadConI( iRegI dst, immI src ) %{ + match(Set dst src); + ins_cost(DEFAULT_COST * 3/2); + format %{ "MOV_SLOW $dst, $src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant); + %} + ins_pipe(ialu_hi_lo_reg); +%} + +instruct loadConIMov( iRegI dst, immIMov src ) %{ + match(Set dst src); + size(4); + format %{ "MOV $dst, $src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant); + %} + ins_pipe(ialu_imm); +%} + +instruct loadConIMovn( iRegI dst, immIRotn src ) %{ + match(Set dst src); + size(4); + format %{ "MVN $dst, ~$src" %} + ins_encode %{ + __ mvn_i($dst$$Register, ~$src$$constant); + %} + ins_pipe(ialu_imm_n); +%} + +instruct loadConI16( iRegI dst, immI16 src ) %{ + match(Set dst src); + size(4); + format %{ "MOVW $dst, $src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant); + %} + ins_pipe(ialu_imm_n); +%} + +instruct loadConP(iRegP dst, immP src) %{ + match(Set dst src); + ins_cost(DEFAULT_COST * 3/2); + format %{ "MOV_SLOW $dst,$src\t!ptr" %} + ins_encode %{ + relocInfo::relocType constant_reloc = _opnds[1]->constant_reloc(); + intptr_t val = $src$$constant; + if (constant_reloc == relocInfo::oop_type) { + __ movoop($dst$$Register, (jobject)val, true); + } else if (constant_reloc == relocInfo::metadata_type) { + __ mov_metadata($dst$$Register, (Metadata*)val); + } else { + __ mov($dst$$Register, val); + } + %} + ins_pipe(loadConP); +%} + + +instruct loadConP_poll(iRegP dst, immP_poll src) %{ + match(Set dst src); + ins_cost(DEFAULT_COST); + format %{ "MOV_SLOW $dst,$src\t!ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant); + %} + ins_pipe(loadConP_poll); +%} + +instruct loadConL(iRegL dst, immL src) %{ + match(Set dst src); + ins_cost(DEFAULT_COST * 4); + format %{ "MOV_SLOW $dst.lo, $src & 0x0FFFFFFFFL \t! 
long\n\t" + "MOV_SLOW $dst.hi, $src >> 32" %} + ins_encode %{ + __ mov(reg_to_register_object($dst$$reg), $src$$constant & 0x0FFFFFFFFL); + __ mov(reg_to_register_object($dst$$reg + 1), ((julong)($src$$constant)) >> 32); + %} + ins_pipe(loadConL); +%} + +instruct loadConL16( iRegL dst, immL16 src ) %{ + match(Set dst src); + ins_cost(DEFAULT_COST * 2); + + size(8); + format %{ "MOVW $dst.lo, $src \n\t" + "MOVW $dst.hi, 0 \n\t" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant); + __ movw_i(reg_to_register_object($dst$$reg + 1), 0); + %} + ins_pipe(ialu_imm); +%} + +instruct loadConF_imm8(regF dst, imm8F src) %{ + match(Set dst src); + ins_cost(DEFAULT_COST); + size(4); + + format %{ "FCONSTS $dst, $src"%} + + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +instruct loadConF(regF dst, immF src, iRegI tmp) %{ + match(Set dst src); + ins_cost(DEFAULT_COST * 2); + effect(TEMP tmp); + size(3*4); + + format %{ "MOV_SLOW $tmp, $src\n\t" + "FMSR $dst, $tmp"%} + + ins_encode %{ + // FIXME revisit once 6961697 is in + union { + jfloat f; + int i; + } v; + v.f = $src$$constant; + __ mov($tmp$$Register, v.i); + __ vmov_f32($dst$$FloatRegister, $tmp$$Register); + %} + ins_pipe(loadConFD); // FIXME +%} + +instruct loadConD_imm8(regD dst, imm8D src) %{ + match(Set dst src); + ins_cost(DEFAULT_COST); + size(4); + + format %{ "FCONSTD $dst, $src"%} + + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +instruct loadConD(regD dst, immD src, iRegP tmp) %{ + match(Set dst src); + effect(TEMP tmp); + ins_cost(MEMORY_REF_COST); + format %{ "FLDD $dst, [$constanttablebase + $constantoffset]\t! load from constant table: double=$src" %} + + ins_encode %{ + Register r = $constanttablebase; + int offset = $constantoffset($src); + if (!is_memoryD(offset)) { // can't use a predicate + // in load constant instructs + __ add($tmp$$Register, r, offset); + r = $tmp$$Register; + offset = 0; + } + __ vldr_f64($dst$$FloatRegister, Address(r, offset)); + %} + ins_pipe(loadConFD); +%} + +// Prefetch instructions. +// Must be safe to execute with invalid address (cannot fault). + +instruct prefetchAlloc_mp( memoryP mem ) %{ + predicate(VM_Version::features() & FT_MP_EXT); + match( PrefetchAllocation mem ); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "PLDW $mem\t! Prefetch allocation" %} + ins_encode %{ + __ pldw($mem$$Address); + %} + ins_pipe(iload_mem); +%} + +instruct prefetchAlloc_sp( memoryP mem ) %{ + predicate(!(VM_Version::features() & FT_MP_EXT)); + match( PrefetchAllocation mem ); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "PLD $mem\t! Prefetch allocation" %} + ins_encode %{ + __ pld($mem$$Address); + %} + ins_pipe(iload_mem); +%} + +//----------Store Instructions------------------------------------------------- +// Store Byte +instruct storeB(memoryB mem, store_RegI src) %{ + match(Set mem (StoreB mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "STRB $src,$mem\t! byte" %} + ins_encode %{ + __ strb($src$$Register, $mem$$Address); + %} + ins_pipe(istore_mem_reg); +%} + +instruct storeCM(memoryB mem, store_RegI src) %{ + match(Set mem (StoreCM mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "STRB $src,$mem\t! 
CMS card-mark byte" %} + ins_encode %{ + __ strb($src$$Register, $mem$$Address); + %} + ins_pipe(istore_mem_reg); +%} + +// Store Char/Short + +instruct storeC(memoryS mem, store_RegI src) %{ + match(Set mem (StoreC mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "STRH $src,$mem\t! short" %} + ins_encode %{ + __ strh($src$$Register, $mem$$Address); + %} + ins_pipe(istore_mem_reg); +%} + +// Store Integer + +instruct storeI(memoryI mem, store_RegI src) %{ + match(Set mem (StoreI mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "str $src,$mem" %} + ins_encode %{ + __ str($src$$Register, $mem$$Address); + %} + ins_pipe(istore_mem_reg); +%} + +// Store Long + +instruct storeL(memoryL mem, store_RegLd src) %{ + predicate(!((StoreLNode*)n)->require_atomic_access()); + match(Set mem (StoreL mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "strd $src,$mem\t! long\n\t" %} + + ins_encode %{ + __ strd($src$$Register, $mem$$Address); + %} + ins_pipe(istore_mem_reg); +%} + +instruct storeL_2instr(memorylong mem, iRegL src) %{ + predicate(!((StoreLNode*)n)->require_atomic_access()); + match(Set mem (StoreL mem src)); + ins_cost(MEMORY_REF_COST + DEFAULT_COST); + + size(8); + format %{ "STR $src.lo,$mem\t! long\n\t" + "STR $src.hi,$mem+4" %} + + ins_encode %{ + Address Amemlo = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp, relocInfo::none); + Address Amemhi = Address::make_raw($mem$$base, $mem$$index, $mem$$scale, $mem$$disp + 4, relocInfo::none); + __ str($src$$Register, Amemlo); + __ str($src$$Register->successor(), Amemhi); + %} + ins_pipe(istore_mem_reg); +%} + +instruct storeL_volatile(indirect mem, iRegL src) %{ + predicate(((StoreLNode*)n)->require_atomic_access()); + match(Set mem (StoreL mem src)); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "STMIA $src,$mem\t! long" %} + ins_encode %{ + // FIXME: why is stmia considered atomic? Should be strexd + // TODO: need 3 temp registers to use atomic_strd + __ stmia(reg_to_register_object($mem$$base), RegSet::of($src$$Register, reg_to_register_object($src$$reg + 1)).bits(), /*wb*/false); + %} + ins_pipe(istore_mem_reg); +%} + +instruct storeL_volatile_fp(memoryD mem, iRegL src) %{ + predicate(((StoreLNode*)n)->require_atomic_access()); + match(Set mem (StoreL mem src)); + ins_cost(MEMORY_REF_COST); + size(8); + format %{ "FMDRR S14, $src\t! long \n\t" + "FSTD S14, $mem" %} + ins_encode %{ + __ vmov_f64(f14, $src$$Register, $src$$Register->successor()); + __ vstr_f64(f14, $mem$$Address); + %} + ins_pipe(istore_mem_reg); +%} + +// Store Pointer + +instruct storeP(memoryP mem, store_ptr_RegP src) %{ + match(Set mem (StoreP mem src)); + ins_cost(MEMORY_REF_COST); + size(4); + + format %{ "STR $src,$mem\t! ptr" %} + ins_encode %{ + __ str($src$$Register, $mem$$Address); + %} + ins_pipe(istore_mem_spORreg); +%} + +// Store Double + +instruct storeD(memoryD mem, regD src) %{ + match(Set mem (StoreD mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + // FIXME: needs to be atomic, but ARMv7 A.R.M. 
guarantees + // only LDREXD and STREXD are 64-bit single-copy atomic + format %{ "FSTD $src,$mem" %} + ins_encode %{ + __ vstr_f64($src$$FloatRegister, $mem$$Address); + %} + ins_pipe(fstoreD_mem_reg); +%} + +// Store Float + +instruct storeF( memoryF mem, regF src) %{ + match(Set mem (StoreF mem src)); + ins_cost(MEMORY_REF_COST); + + size(4); + format %{ "FSTS $src,$mem" %} + ins_encode %{ + __ vstr_f32($src$$FloatRegister, $mem$$Address); + %} + ins_pipe(fstoreF_mem_reg); +%} + +//----------MemBar Instructions----------------------------------------------- +// Memory barrier flavors + +// TODO: take advantage of Aarch64 load-acquire, store-release, etc +// pattern-match out unnecessary membars +instruct membar_storestore() %{ + match(MemBarStoreStore); + ins_cost(4*MEMORY_REF_COST); + + size(4); + format %{ "MEMBAR-storestore" %} + ins_encode %{ + __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore)); + %} + ins_pipe(long_memory_op); +%} + +instruct membar_acquire() %{ + match(MemBarAcquire); + match(LoadFence); + ins_cost(4*MEMORY_REF_COST); + + size(4); + format %{ "MEMBAR-acquire" %} + ins_encode %{ + __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadLoad | MacroAssembler::LoadStore)); + %} + ins_pipe(long_memory_op); +%} + +instruct membar_acquire_lock() %{ + match(MemBarAcquireLock); + ins_cost(0); + + size(0); + format %{ "!MEMBAR-acquire (CAS in prior FastLock so empty encoding)" %} + ins_encode( ); + ins_pipe(empty); +%} + +instruct membar_release() %{ + match(MemBarRelease); + match(StoreFence); + ins_cost(4*MEMORY_REF_COST); + + size(4); + format %{ "MEMBAR-release" %} + ins_encode %{ + __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::StoreStore | MacroAssembler::LoadStore)); + %} + ins_pipe(long_memory_op); +%} + +instruct membar_release_lock() %{ + match(MemBarReleaseLock); + ins_cost(0); + + size(0); + format %{ "!MEMBAR-release (CAS in succeeding FastUnlock so empty encoding)" %} + ins_encode( ); + ins_pipe(empty); +%} + +instruct membar_volatile() %{ + match(MemBarVolatile); + ins_cost(4*MEMORY_REF_COST); + + size(4); + format %{ "MEMBAR-volatile" %} + ins_encode %{ + __ membar(MacroAssembler::StoreLoad); + %} + ins_pipe(long_memory_op); +%} + +instruct unnecessary_membar_volatile() %{ + match(MemBarVolatile); + predicate(Matcher::post_store_load_barrier(n)); + ins_cost(0); + + size(0); + format %{ "!MEMBAR-volatile (unnecessary so empty encoding)" %} + ins_encode( ); + ins_pipe(empty); +%} + +//----------Register Move Instructions----------------------------------------- +// instruct roundDouble_nop(regD dst) %{ +// match(Set dst (RoundDouble dst)); +// ins_pipe(empty); +// %} + + +// instruct roundFloat_nop(regF dst) %{ +// match(Set dst (RoundFloat dst)); +// ins_pipe(empty); +// %} + + +// Cast Index to Pointer for unsafe natives +instruct castX2P(iRegX src, iRegP dst) %{ + match(Set dst (CastX2P src)); + + format %{ "MOV $dst,$src\t! IntX->Ptr if $dst != $src" %} + ins_encode %{ + if ($dst$$Register != $src$$Register) { + __ mov($dst$$Register, $src$$Register); + } + %} + ins_pipe(ialu_reg); +%} + +// Cast Pointer to Index for unsafe natives +instruct castP2X(iRegP src, iRegX dst) %{ + match(Set dst (CastP2X src)); + + format %{ "MOV $dst,$src\t! 
Ptr->IntX if $dst != $src" %} + ins_encode %{ + if ($dst$$Register != $src$$Register) { + __ mov($dst$$Register, $src$$Register); + } + %} + ins_pipe(ialu_reg); +%} + +//----------Conditional Move--------------------------------------------------- +// Conditional move +instruct cmovIP_reg(cmpOpP cmp, flagsRegP pcc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp pcc) (Binary dst src))); + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src\t! int" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovIP_immMov(cmpOpP cmp, flagsRegP pcc, iRegI dst, immIMov src) %{ + match(Set dst (CMoveI (Binary cmp pcc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovIP_imm16(cmpOpP cmp, flagsRegP pcc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp pcc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOVw$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovI_reg(cmpOp cmp, flagsReg icc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovI_immMov(cmpOp cmp, flagsReg icc, iRegI dst, immIMov src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovII_imm16(cmpOp cmp, flagsReg icc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOVw$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovII_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovII_immMov_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, immIMov src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(140); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct 
cmovII_imm16_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(140); + size(4); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovIIu_reg(cmpOpU cmp, flagsRegU icc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovIIu_immMov(cmpOpU cmp, flagsRegU icc, iRegI dst, immIMov src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovIIu_imm16(cmpOpU cmp, flagsRegU icc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp icc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +// Conditional move +instruct cmovPP_reg(cmpOpP cmp, flagsRegP pcc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp pcc) (Binary dst src))); + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPP_imm(cmpOpP cmp, flagsRegP pcc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp pcc) (Binary dst src))); + ins_cost(140); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +// This instruction also works with CmpN so we don't need cmovPN_reg. +instruct cmovPI_reg(cmpOp cmp, flagsReg icc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(4); + format %{ "MOV$cmp $dst,$src\t! ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(150); + + size(4); + format %{ "MOV$cmp $dst,$src\t! ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPIu_reg(cmpOpU cmp, flagsRegU icc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(4); + format %{ "MOV$cmp $dst,$src\t! 
ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPI_imm(cmpOp cmp, flagsReg icc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp icc) (Binary dst src))); + ins_cost(140); + + size(4); + format %{ "MOV$cmp $dst,$src\t! ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovPI_imm_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(140); + + size(4); + format %{ "MOV$cmp $dst,$src\t! ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovPIu_imm(cmpOpU cmp, flagsRegU icc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp icc) (Binary dst src))); + ins_cost(140); + + size(4); + format %{ "MOV$cmp $dst,$src\t! ptr" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +// Conditional move +instruct cmovFP_reg(cmpOpP cmp, flagsRegP pcc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp pcc) (Binary dst src))); + ins_cost(150); + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovFI_reg(cmpOp cmp, flagsReg icc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovFI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(150); + + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovFIu_reg(cmpOpU cmp, flagsRegU icc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +// Conditional move +instruct cmovDP_reg(cmpOpP cmp, flagsRegP pcc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp pcc) (Binary dst src))); + ins_cost(150); + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, 
(Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_double_move); +%} + +instruct cmovDI_reg(cmpOp cmp, flagsReg icc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_double_move); +%} + +instruct cmovDI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(150); + + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_double_move); +%} + +instruct cmovDIu_reg(cmpOpU cmp, flagsRegU icc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_double_move); +%} + +// Conditional move +instruct cmovLP_reg(cmpOpP cmp, flagsRegP pcc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp pcc) (Binary dst src))); + ins_cost(150); + + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! long\n\t" + "MOV$cmp $dst.hi,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct cmovLP_immRot(cmpOpP cmp, flagsRegP pcc, iRegL dst, immLlowRot src) %{ + match(Set dst (CMoveL (Binary cmp pcc) (Binary dst src))); + ins_cost(140); + + size(8); + format %{ "MOV$cmp $dst.lo,$src\t! long\n\t" + "MOV$cmp $dst.hi,0" %} + ins_encode %{ + __ mov($dst$$Register, (long)$src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLP_imm16(cmpOpP cmp, flagsRegP pcc, iRegL dst, immL16 src) %{ + match(Set dst (CMoveL (Binary cmp pcc) (Binary dst src))); + ins_cost(140); + + size(8); + format %{ "MOV$cmp $dst.lo,$src\t! long\n\t" + "MOV$cmp $dst.hi,0" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLI_reg(cmpOp cmp, flagsReg icc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! 
long\n\t" + "MOV$cmp $dst.hi,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovLI_reg_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(150); + + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! long\n\t" + "MOV$cmp $dst.hi,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct cmovLI_immRot(cmpOp cmp, flagsReg icc, iRegL dst, immLlowRot src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + ins_cost(140); + + size(8); + format %{ "MOV$cmp $dst.lo,$src\t! long\n\t" + "MOV$cmp $dst.hi,0" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct cmovLI_immRot_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, immLlowRot src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(140); + + size(8); + format %{ "MOV$cmp $dst.lo,$src\t! long\n\t" + "MOV$cmp $dst.hi,0" %} + ins_encode %{ + __ mov($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLI_imm16(cmpOp cmp, flagsReg icc, iRegL dst, immL16 src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + ins_cost(140); + + size(8); + format %{ "MOV$cmp $dst.lo,$src\t! long\n\t" + "MOV$cmp $dst.hi,0" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + __ movw_i($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLI_imm16_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, iRegL dst, immL16 src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || + _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + ins_cost(140); + + size(8); + format %{ "MOV$cmp $dst.lo,$src\t! 
long\n\t" + "MOV$cmp $dst.hi,0" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + __ movw_i($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLIu_reg(cmpOpU cmp, flagsRegU icc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp icc) (Binary dst src))); + ins_cost(150); + + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! long\n\t" + "MOV$cmp $dst.hi,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + + +//----------OS and Locking Instructions---------------------------------------- + +// This name is KNOWN by the ADLC and cannot be changed. +// The ADLC forces a 'TypeRawPtr::BOTTOM' output type +// for this guy. +instruct tlsLoadP(RthreadRegP dst) %{ + match(Set dst (ThreadLocal)); + + size(0); + ins_cost(0); + format %{ "! TLS is in $dst" %} + ins_encode( /*empty encoding*/ ); + ins_pipe(ialu_none); +%} + +instruct checkCastPP( iRegP dst ) %{ + match(Set dst (CheckCastPP dst)); + + size(0); + format %{ "! checkcastPP of $dst" %} + ins_encode( /*empty encoding*/ ); + ins_pipe(empty); +%} + + +instruct castPP( iRegP dst ) %{ + match(Set dst (CastPP dst)); + format %{ "! castPP of $dst" %} + ins_encode( /*empty encoding*/ ); + ins_pipe(empty); +%} + +instruct castII( iRegI dst ) %{ + match(Set dst (CastII dst)); + format %{ "! castII of $dst" %} + ins_encode( /*empty encoding*/ ); + ins_cost(0); + ins_pipe(empty); +%} + +//----------Arithmetic Instructions-------------------------------------------- +// Addition Instructions +// Register Addition +instruct addI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (AddI src1 src2)); + + size(4); + format %{ "add_32 $dst,$src1,$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct addshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (AddI (LShiftI src1 src2) src3)); + + size(4); + format %{ "add_32 $dst,$src3,$src1<<$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src3$$Register, $src1$$Register, lsl($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct addshlI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ + match(Set dst (AddI (LShiftI src1 src2) src3)); + + size(4); + format %{ "add_32 $dst,$src3,$src1<<$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src3$$Register, $src1$$Register, lsl($src2$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct addsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (AddI (RShiftI src1 src2) src3)); + + size(4); + format %{ "add_32 $dst,$src3,$src1>>$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src3$$Register, $src1$$Register, asr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct addsarI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ + match(Set dst (AddI (RShiftI src1 src2) src3)); + + size(4); + format %{ "add_32 $dst,$src3,$src1>>$src2\t! 
int" %} + ins_encode %{ + __ add($dst$$Register, $src3$$Register, $src1$$Register, asr($src2$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct addshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (AddI (URShiftI src1 src2) src3)); + + size(4); + format %{ "add_32 $dst,$src3,$src1>>>$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src3$$Register, $src1$$Register, lsr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct addshrI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ + match(Set dst (AddI (URShiftI src1 src2) src3)); + + size(4); + format %{ "add_32 $dst,$src3,$src1>>>$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src3$$Register, $src1$$Register, lsr($src2$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Immediate Addition +instruct addI_reg_aimmI(iRegI dst, iRegI src1, aimmI src2) %{ + match(Set dst (AddI src1 src2)); + + size(4); + format %{ "add_32 $dst,$src1,$src2\t! int" %} + ins_encode %{ + __ add($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +// Pointer Register Addition +instruct addP_reg_reg(iRegP dst, iRegP src1, iRegX src2) %{ + match(Set dst (AddP src1 src2)); + + size(4); + format %{ "ADD $dst,$src1,$src2\t! ptr" %} + ins_encode %{ + __ add($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +// shifted iRegX operand +operand shiftedX(iRegX src2, shimmX src3) %{ +//constraint(ALLOC_IN_RC(sp_ptr_reg)); + match(LShiftX src2 src3); + + op_cost(1); + format %{ "$src2 << $src3" %} + interface(MEMORY_INTER) %{ + base($src2); + index(0xff); + scale($src3); + disp(0x0); + %} +%} + +instruct addshlP_reg_reg_imm(iRegP dst, iRegP src1, shiftedX src2) %{ + match(Set dst (AddP src1 src2)); + + ins_cost(DEFAULT_COST * 3/2); + size(4); + format %{ "ADD $dst,$src1,$src2\t! ptr" %} + ins_encode %{ + Register base = reg_to_register_object($src2$$base); + __ add($dst$$Register, $src1$$Register, base, lsl($src2$$scale)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Pointer Immediate Addition +instruct addP_reg_aimmX(iRegP dst, iRegP src1, aimmX src2) %{ + match(Set dst (AddP src1 src2)); + + size(4); + format %{ "ADD $dst,$src1,$src2\t! ptr" %} + ins_encode %{ + __ add($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +// Long Addition +instruct addL_reg_reg(iRegL dst, iRegL src1, iRegL src2, flagsReg ccr) %{ + match(Set dst (AddL src1 src2)); + effect(KILL ccr); + ins_cost(DEFAULT_COST*2); + size(8); + format %{ "ADDS $dst.lo,$src1.lo,$src2.lo\t! long\n\t" + "ADC $dst.hi,$src1.hi,$src2.hi" %} + ins_encode %{ + __ adds($dst$$Register, $src1$$Register, $src2$$Register); + __ adc($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct addL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con, flagsReg ccr) %{ + match(Set dst (AddL src1 con)); + effect(KILL ccr); + size(8); + format %{ "ADDS $dst.lo,$src1.lo,$con\t! long\n\t" + "ADC $dst.hi,$src1.hi,0" %} + ins_encode %{ + __ adds($dst$$Register, $src1$$Register, (long)$con$$constant); + __ adc($dst$$Register->successor(), $src1$$Register->successor(), 0); + %} + ins_pipe(ialu_reg_imm); +%} + +//----------Conditional_store-------------------------------------------------- +// Conditional-store of the updated heap-top. 
+// Used during allocation of the shared heap. +// Sets flags (EQ) on success. + +// TODO: optimize out barriers with AArch64 load-acquire/store-release +// LoadP-locked. +instruct loadPLocked(iRegP dst, memoryex mem) %{ + match(Set dst (LoadPLocked mem)); + size(4); + format %{ "LDREX $dst,$mem" %} + ins_encode %{ + __ ldrex($dst$$Register,$mem$$Address); + %} + ins_pipe(iload_mem); +%} + +instruct storePConditional( memoryex heap_top_ptr, iRegP oldval, iRegP newval, iRegI tmp, flagsRegP pcc ) %{ + predicate(_kids[1]->_kids[0]->_leaf->Opcode() == Op_LoadPLocked); // only works in conjunction with a LoadPLocked node + match(Set pcc (StorePConditional heap_top_ptr (Binary oldval newval))); + effect( TEMP tmp ); + size(8); + format %{ "STREX $tmp,$newval,$heap_top_ptr\n\t" + "CMP $tmp, 0" %} + ins_encode %{ + __ strex($tmp$$Register, $newval$$Register, $heap_top_ptr$$Address); + __ cmp($tmp$$Register, 0); + %} + ins_pipe( long_memory_op ); +%} + +// Conditional-store of an intx value. +instruct storeXConditional( memoryex mem, iRegX oldval, iRegX newval, iRegX tmp, flagsReg icc ) %{ + match(Set icc (StoreIConditional mem (Binary oldval newval))); + effect( TEMP tmp ); + size(28); + format %{ "loop: \n\t" + "LDREX $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem], DOESN'T set $newval=[$mem] in any case\n\t" + "XORS $tmp,$tmp, $oldval\n\t" + "STREX.eq $tmp, $newval, $mem\n\t" + "CMP.eq $tmp, 1 \n\t" + "B.eq loop \n\t" + "TEQ $tmp, 0\n\t" + "membar LoadStore|LoadLoad" %} + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($tmp$$Register, $mem$$Address); + __ eors($tmp$$Register, $tmp$$Register, $oldval$$Register); + __ strex($tmp$$Register, $newval$$Register, $mem$$Address, Assembler::EQ); + __ cmp($tmp$$Register, 1, Assembler::EQ); + __ b(loop, Assembler::EQ); + __ teq($tmp$$Register, 0); + // used by biased locking only. Requires a membar. + __ membar(MacroAssembler::Membar_mask_bits(MacroAssembler::LoadStore | MacroAssembler::LoadLoad)); + %} + ins_pipe( long_memory_op ); +%} + +// No flag versions for CompareAndSwap{P,I,L} because matcher can't match them + +instruct compareAndSwapL_bool(memoryex mem, iRegL oldval, iRegLd newval, iRegI res, iRegLd tmp, flagsReg ccr ) %{ + match(Set res (CompareAndSwapL mem (Binary oldval newval))); + effect( KILL ccr, TEMP tmp); + size(32); + format %{ "loop: \n\t" + "LDREXD $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t" + "CMP $tmp.lo, $oldval.lo\n\t" + "CMP.eq $tmp.hi, $oldval.hi\n\t" + "STREXD.eq $tmp, $newval, $mem\n\t" + "MOV.ne $tmp, 0 \n\t" + "XORS.eq $tmp,$tmp, 1 \n\t" + "B.eq loop \n\t" + "MOV $res, $tmp" %} + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrexd($tmp$$Register, $mem$$Address); + __ cmp($tmp$$Register, $oldval$$Register); + __ cmp($tmp$$Register->successor(), $oldval$$Register->successor(), Assembler::EQ); + __ strexd($tmp$$Register, $newval$$Register, $mem$$Address, Assembler::EQ); + __ mov($tmp$$Register, 0, Assembler::NE); + __ eors($tmp$$Register, $tmp$$Register, 1, Assembler::EQ); + __ b(loop, Assembler::EQ); + __ mov($res$$Register, $tmp$$Register); + %} + ins_pipe( long_memory_op ); +%} + + +instruct compareAndSwapI_bool(memoryex mem, iRegI oldval, iRegI newval, iRegI res, iRegI tmp, flagsReg ccr ) %{ + match(Set res (CompareAndSwapI mem (Binary oldval newval))); + effect( KILL ccr, TEMP tmp); + size(28); + format %{ "loop: \n\t" + "LDREX $tmp, $mem\t! 
If $oldval==[$mem] Then store $newval into [$mem]\n\t" + "CMP $tmp, $oldval\n\t" + "STREX.eq $tmp, $newval, $mem\n\t" + "MOV.ne $tmp, 0 \n\t" + "XORS.eq $tmp,$tmp, 1 \n\t" + "B.eq loop \n\t" + "MOV $res, $tmp" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($tmp$$Register,$mem$$Address); + __ cmp($tmp$$Register, $oldval$$Register); + __ strex($tmp$$Register, $newval$$Register, $mem$$Address, Assembler::EQ); + __ mov($tmp$$Register, 0, Assembler::NE); + __ eors($tmp$$Register, $tmp$$Register, 1, Assembler::EQ); + __ b(loop, Assembler::EQ); + __ mov($res$$Register, $tmp$$Register); + %} + ins_pipe( long_memory_op ); +%} + +instruct compareAndSwapP_bool(memoryex mem, iRegP oldval, iRegP newval, iRegI res, iRegI tmp, flagsReg ccr ) %{ + match(Set res (CompareAndSwapP mem (Binary oldval newval))); + effect( KILL ccr, TEMP tmp); + size(28); + format %{ "loop: \n\t" + "LDREX $tmp, $mem\t! If $oldval==[$mem] Then store $newval into [$mem]\n\t" + "CMP $tmp, $oldval\n\t" + "STREX.eq $tmp, $newval, $mem\n\t" + "MOV.ne $tmp, 0 \n\t" + "EORS.eq $tmp,$tmp, 1 \n\t" + "B.eq loop \n\t" + "MOV $res, $tmp" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($tmp$$Register,$mem$$Address); + __ cmp($tmp$$Register, $oldval$$Register); + __ strex($tmp$$Register, $newval$$Register, $mem$$Address, Assembler::EQ); + __ mov($tmp$$Register, 0, Assembler::NE); + __ eors($tmp$$Register, $tmp$$Register, 1, Assembler::EQ); + __ b(loop, Assembler::EQ); + __ mov($res$$Register, $tmp$$Register); + %} + ins_pipe( long_memory_op ); +%} + +instruct xaddI_aimmI_no_res(memoryex mem, aimmI add, Universe dummy, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{ + predicate(n->as_LoadStore()->result_not_used()); + match(Set dummy (GetAndAddI mem add)); + effect(KILL ccr, TEMP tmp1, TEMP tmp2); + size(20); + format %{ "loop: \n\t" + "LDREX $tmp1, $mem\n\t" + "ADD $tmp1, $tmp1, $add\n\t" + "STREX $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($tmp1$$Register,$mem$$Address); + __ add($tmp1$$Register, $tmp1$$Register, $add$$constant); + __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xaddI_reg_no_res(memoryex mem, iRegI add, Universe dummy, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{ + predicate(n->as_LoadStore()->result_not_used()); + match(Set dummy (GetAndAddI mem add)); + effect(KILL ccr, TEMP tmp1, TEMP tmp2); + size(20); + format %{ "loop: \n\t" + "LDREX $tmp1, $mem\n\t" + "ADD $tmp1, $tmp1, $add\n\t" + "STREX $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($tmp1$$Register,$mem$$Address); + __ add($tmp1$$Register, $tmp1$$Register, $add$$Register); + __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xaddI_aimmI(memoryex mem, aimmI add, iRegI res, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{ + match(Set res (GetAndAddI mem add)); + effect(KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res); + size(20); + format %{ "loop: \n\t" + "LDREX $res, $mem\n\t" + "ADD $tmp1, $res, $add\n\t" + "STREX $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($res$$Register,$mem$$Address); + __ add($tmp1$$Register, $res$$Register, $add$$constant); + __ strex($tmp2$$Register, 
$tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xaddI_reg(memoryex mem, iRegI add, iRegI res, iRegI tmp1, iRegI tmp2, flagsReg ccr) %{ + match(Set res (GetAndAddI mem add)); + effect(KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res); + size(20); + format %{ "loop: \n\t" + "LDREX $res, $mem\n\t" + "ADD $tmp1, $res, $add\n\t" + "STREX $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($res$$Register,$mem$$Address); + __ add($tmp1$$Register, $res$$Register, $add$$Register); + __ strex($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xaddL_reg_no_res(memoryex mem, iRegL add, Universe dummy, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{ + predicate(n->as_LoadStore()->result_not_used()); + match(Set dummy (GetAndAddL mem add)); + effect( KILL ccr, TEMP tmp1, TEMP tmp2); + size(24); + format %{ "loop: \n\t" + "LDREXD $tmp1, $mem\n\t" + "ADDS $tmp1.lo, $tmp1.lo, $add.lo\n\t" + "ADC $tmp1.hi, $tmp1.hi, $add.hi\n\t" + "STREXD $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrexd($tmp1$$Register, $mem$$Address); + __ adds($tmp1$$Register, $tmp1$$Register, $add$$Register); + __ adc($tmp1$$Register->successor(), $tmp1$$Register->successor(), $add$$Register->successor()); + __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct xaddL_immRot_no_res(memoryex mem, immLlowRot add, Universe dummy, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{ + predicate(n->as_LoadStore()->result_not_used()); + match(Set dummy (GetAndAddL mem add)); + effect( KILL ccr, TEMP tmp1, TEMP tmp2); + size(24); + format %{ "loop: \n\t" + "LDREXD $tmp1, $mem\n\t" + "ADDS $tmp1.lo, $tmp1.lo, $add\n\t" + "ADC $tmp1.hi, $tmp1.hi, 0\n\t" + "STREXD $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrexd($tmp1$$Register, $mem$$Address); + __ adds($tmp1$$Register, $tmp1$$Register, (long)$add$$constant); + __ adc($tmp1$$Register->successor(), $tmp1$$Register->successor(), 0); + __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xaddL_reg(memoryex mem, iRegL add, iRegLd res, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{ + match(Set res (GetAndAddL mem add)); + effect( KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res); + size(24); + format %{ "loop: \n\t" + "LDREXD $res, $mem\n\t" + "ADDS $tmp1.lo, $res.lo, $add.lo\n\t" + "ADC $tmp1.hi, $res.hi, $add.hi\n\t" + "STREXD $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrexd($res$$Register, $mem$$Address); + __ adds($tmp1$$Register, $res$$Register, $add$$Register); + __ adc($tmp1$$Register->successor(), $res$$Register->successor(), $add$$Register->successor()); + __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// 
(hi($con$$constant), lo($con$$constant)) becomes +instruct xaddL_immRot(memoryex mem, immLlowRot add, iRegLd res, iRegLd tmp1, iRegI tmp2, flagsReg ccr) %{ + match(Set res (GetAndAddL mem add)); + effect( KILL ccr, TEMP tmp1, TEMP tmp2, TEMP res); + size(24); + format %{ "loop: \n\t" + "LDREXD $res, $mem\n\t" + "ADDS $tmp1.lo, $res.lo, $add\n\t" + "ADC $tmp1.hi, $res.hi, 0\n\t" + "STREXD $tmp2, $tmp1, $mem\n\t" + "CMP $tmp2, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrexd($res$$Register, $mem$$Address); + __ adds($tmp1$$Register, $res$$Register, (long)$add$$constant); + __ adc($tmp1$$Register->successor(), $res$$Register->successor(), 0); + __ strexd($tmp2$$Register, $tmp1$$Register, $mem$$Address); + __ cmp($tmp2$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xchgI(memoryex mem, iRegI newval, iRegI res, iRegI tmp, flagsReg ccr) %{ + match(Set res (GetAndSetI mem newval)); + effect(KILL ccr, TEMP tmp, TEMP res); + size(16); + format %{ "loop: \n\t" + "LDREX $res, $mem\n\t" + "STREX $tmp, $newval, $mem\n\t" + "CMP $tmp, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($res$$Register,$mem$$Address); + __ strex($tmp$$Register, $newval$$Register, $mem$$Address); + __ cmp($tmp$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xchgL(memoryex mem, iRegLd newval, iRegLd res, iRegI tmp, flagsReg ccr) %{ + match(Set res (GetAndSetL mem newval)); + effect( KILL ccr, TEMP tmp, TEMP res); + size(16); + format %{ "loop: \n\t" + "LDREXD $res, $mem\n\t" + "STREXD $tmp, $newval, $mem\n\t" + "CMP $tmp, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrexd($res$$Register, $mem$$Address); + __ strexd($tmp$$Register, $newval$$Register, $mem$$Address); + __ cmp($tmp$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +instruct xchgP(memoryex mem, iRegP newval, iRegP res, iRegI tmp, flagsReg ccr) %{ + match(Set res (GetAndSetP mem newval)); + effect(KILL ccr, TEMP tmp, TEMP res); + size(16); + format %{ "loop: \n\t" + "LDREX $res, $mem\n\t" + "STREX $tmp, $newval, $mem\n\t" + "CMP $tmp, 0 \n\t" + "B.ne loop \n\t" %} + + ins_encode %{ + Label loop; + __ bind(loop); + __ ldrex($res$$Register,$mem$$Address); + __ strex($tmp$$Register, $newval$$Register, $mem$$Address); + __ cmp($tmp$$Register, 0); + __ b(loop, Assembler::NE); + %} + ins_pipe( long_memory_op ); +%} + +//--------------------- +// Subtraction Instructions +// Register Subtraction +instruct subI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (SubI src1 src2)); + + size(4); + format %{ "sub_32 $dst,$src1,$src2\t! int" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct subshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (SubI src1 (LShiftI src2 src3))); + + size(4); + format %{ "SUB $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct subshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (SubI src1 (LShiftI src2 src3))); + + size(4); + format %{ "sub_32 $dst,$src1,$src2<<$src3\t! 
int" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct subsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (SubI src1 (RShiftI src2 src3))); + + size(4); + format %{ "SUB $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct subsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (SubI src1 (RShiftI src2 src3))); + + size(4); + format %{ "sub_32 $dst,$src1,$src2>>$src3\t! int" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct subshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (SubI src1 (URShiftI src2 src3))); + + size(4); + format %{ "SUB $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct subshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (SubI src1 (URShiftI src2 src3))); + + size(4); + format %{ "sub_32 $dst,$src1,$src2>>>$src3\t! int" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct rsbshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (SubI (LShiftI src1 src2) src3)); + + size(4); + format %{ "RSB $dst,$src3,$src1<<$src2" %} + ins_encode %{ + __ rsb($dst$$Register, $src3$$Register, $src1$$Register, lsl($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct rsbshlI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ + match(Set dst (SubI (LShiftI src1 src2) src3)); + + size(4); + format %{ "RSB $dst,$src3,$src1<<$src2" %} + ins_encode %{ + __ rsb($dst$$Register, $src3$$Register, $src1$$Register, lsl($src2$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct rsbsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (SubI (RShiftI src1 src2) src3)); + + size(4); + format %{ "RSB $dst,$src3,$src1>>$src2" %} + ins_encode %{ + __ rsb($dst$$Register, $src3$$Register, $src1$$Register, asr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct rsbsarI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ + match(Set dst (SubI (RShiftI src1 src2) src3)); + + size(4); + format %{ "RSB $dst,$src3,$src1>>$src2" %} + ins_encode %{ + __ rsb($dst$$Register, $src3$$Register, $src1$$Register, asr($src2$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct rsbshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (SubI (URShiftI src1 src2) src3)); + + size(4); + format %{ "RSB $dst,$src3,$src1>>>$src2" %} + ins_encode %{ + __ rsb($dst$$Register, $src3$$Register, $src1$$Register, lsr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct rsbshrI_reg_imm_reg(iRegI dst, iRegI src1, immU5 src2, iRegI src3) %{ + match(Set dst (SubI (URShiftI src1 src2) src3)); + + size(4); + format %{ "RSB $dst,$src3,$src1>>>$src2" %} + ins_encode %{ + __ rsb($dst$$Register, $src3$$Register, $src1$$Register, lsr($src2$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Immediate Subtraction +instruct subI_reg_aimmI(iRegI dst, iRegI src1, aimmI src2) %{ + match(Set dst (SubI src1 src2)); + + size(4); + format %{ 
"sub_32 $dst,$src1,$src2\t! int" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +instruct subI_reg_immRotneg(iRegI dst, iRegI src1, aimmIneg src2) %{ + match(Set dst (AddI src1 src2)); + + size(4); + format %{ "sub_32 $dst,$src1,-($src2)\t! int" %} + ins_encode %{ + __ sub($dst$$Register, $src1$$Register, -$src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +instruct subI_immRot_reg(iRegI dst, immIRot src1, iRegI src2) %{ + match(Set dst (SubI src1 src2)); + + size(4); + format %{ "RSB $dst,$src2,src1" %} + ins_encode %{ + __ rsb($dst$$Register, $src2$$Register, $src1$$constant); + %} + ins_pipe(ialu_zero_reg); +%} + +// Register Subtraction +instruct subL_reg_reg(iRegL dst, iRegL src1, iRegL src2, flagsReg icc ) %{ + match(Set dst (SubL src1 src2)); + effect (KILL icc); + + size(8); + format %{ "SUBS $dst.lo,$src1.lo,$src2.lo\t! long\n\t" + "SBC $dst.hi,$src1.hi,$src2.hi" %} + ins_encode %{ + __ subs($dst$$Register, $src1$$Register, $src2$$Register); + __ sbc($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); +%} + +// Immediate Subtraction +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct subL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con, flagsReg icc) %{ + match(Set dst (SubL src1 con)); + effect (KILL icc); + + size(8); + format %{ "SUB $dst.lo,$src1.lo,$con\t! long\n\t" + "SBC $dst.hi,$src1.hi,0" %} + ins_encode %{ + __ subs($dst$$Register, $src1$$Register, (long)$con$$constant); + __ sbc($dst$$Register->successor(), $src1$$Register->successor(), 0); + %} + ins_pipe(ialu_reg_imm); +%} + +// Long negation +instruct negL_reg_reg(iRegL dst, immL0 zero, iRegL src2, flagsReg icc) %{ + match(Set dst (SubL zero src2)); + effect (KILL icc); + + size(8); + format %{ "RSBS $dst.lo,$src2.lo,0\t! long\n\t" + "RSC $dst.hi,$src2.hi,0" %} + ins_encode %{ + __ rsbs($dst$$Register, $src2$$Register, 0); + __ rsc($dst$$Register->successor(), $src2$$Register->successor(), 0); + %} + ins_pipe(ialu_zero_reg); +%} + +// Multiplication Instructions +// Integer Multiplication +// Register Multiplication +instruct mulI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (MulI src1 src2)); + + ins_cost(DEFAULT_COST); + size(4); + format %{ "mul_32 $dst,$src1,$src2" %} + ins_encode %{ + __ mul($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(imul_reg_reg); +%} + +instruct mulL_lo1_hi2(iRegL dst, iRegL src1, iRegL src2) %{ + effect(DEF dst, USE src1, USE src2); + ins_cost(DEFAULT_COST); + size(4); + format %{ "MUL $dst.hi,$src1.lo,$src2.hi\t! long" %} + ins_encode %{ + __ mul($dst$$Register->successor(), $src1$$Register, $src2$$Register->successor()); + %} + ins_pipe(imul_reg_reg); +%} + +instruct mulL_hi1_lo2(iRegL dst, iRegL src1, iRegL src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + ins_cost(DEFAULT_COST*3/2); + size(8); + format %{ "MLA $dst.hi,$src1.hi,$src2.lo,$dst.hi\t! long\n\t" + "MOV $dst.lo, 0"%} + ins_encode %{ + __ mla($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register, $dst$$Register->successor()); + __ mov($dst$$Register, 0); + %} + ins_pipe(imul_reg_reg); +%} + +instruct mulL_lo1_lo2(iRegL dst, iRegL src1, iRegL src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "UMLAL $dst.lo,$dst.hi,$src1,$src2\t! 
long" %} + ins_encode %{ + __ umlal($dst$$Register, $dst$$Register->successor(), $src1$$Register, $src2$$Register); + %} + ins_pipe(imul_reg_reg); +%} + +instruct mulL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ + match(Set dst (MulL src1 src2)); + ins_cost(DEFAULT_COST*8/2); + + expand %{ + mulL_lo1_hi2(dst, src1, src2); + mulL_hi1_lo2(dst, src1, src2); + mulL_lo1_lo2(dst, src1, src2); + %} +%} + +instruct mla_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI srcA) %{ + match(Set dst (AddI (MulI src1 src2) srcA)); + + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "MLA $dst,$src1,$src2,$srcA" %} + ins_encode %{ + __ mla($dst$$Register, $src1$$Register, $src2$$Register, $srcA$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct mls_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI srcA) %{ + match(Set dst (SubI srcA (MulI src1 src2))); + + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "MLS $dst,$src1,$src2,$srcA" %} + ins_encode %{ + __ mls($dst$$Register, $src1$$Register, $src2$$Register, $srcA$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct smlal_reg_reg_reg(iRegL dst, iRegI src1, iRegI src2) %{ + match(Set dst (AddL (MulL (ConvI2L src1) (ConvI2L src2)) dst)); + + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "SMLAL $dst.lo,$dst.hi,$src1,$src2" %} + ins_encode %{ + __ smlal($dst$$Register, $dst$$Register->successor(), $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct smull_reg_reg_reg(iRegL dst, iRegI src1, iRegI src2) %{ + match(Set dst (MulL (ConvI2L src1) (ConvI2L src2))); + + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "SMULL $dst.lo,$dst.hi,$src1,$src2" %} + ins_encode %{ + __ smull($dst$$Register, $dst$$Register->successor(), $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +// Integer Division +// Register Division +instruct divI_reg_reg_IDIV(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (DivI src1 src2)); + predicate(VM_Version::features() & FT_HW_DIVIDE); + ins_cost(2*DEFAULT_COST); + + format %{ "SDIV $dst,$src1,$src2"%} + ins_encode %{ + __ sdiv($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(sdiv_reg_reg_IDIV); +%} + +instruct divI_reg_reg_SW(R0RegI dst, R1RegI src1, R2RegI src2, R9RegI temp1, R12RegI temp2, LRRegP lr, flagsReg ccr) %{ + match(Set dst (DivI src1 src2)); + predicate(!(VM_Version::features() & FT_HW_DIVIDE)); + effect( KILL ccr, TEMP temp1, TEMP temp2, USE_KILL src1,USE_KILL src2, KILL lr); + ins_cost((2+71)*DEFAULT_COST); + + format %{ "DIV $dst,$src1,$src2 ! call to StubRoutines::aarch32::idiv_entry()" %} + ins_encode %{ + __ call(StubRoutines::aarch32::idiv_entry(), relocInfo::runtime_call_type); + %} + ins_pipe(sdiv_reg_reg_SW); +%} + +// Register Long Division +instruct divL_reg_reg(R0R1RegL dst, R2R3RegL src1, R0R1RegL src2) %{ + match(Set dst (DivL src1 src2)); + effect(CALL); + ins_cost(DEFAULT_COST*71); + format %{ "DIVL $src1,$src2,$dst\t! long ! 
call to SharedRuntime::ldiv" %} + ins_encode %{ + address target = CAST_FROM_FN_PTR(address, SharedRuntime::ldiv); + __ call(target, relocInfo::runtime_call_type); + %} + ins_pipe(divL_reg_reg); +%} + +// Integer Remainder +// Register Remainder +instruct modI_reg_reg_IDIV(iRegI dst, iRegI src1, iRegI src2, iRegI temp) %{ + match(Set dst (ModI src1 src2)); + predicate(VM_Version::features() & FT_HW_DIVIDE); + effect( TEMP temp); + + format %{ "SDIV $temp,$src1,$src2\n\t" + "MLS $dst, $temp, $src2, $src1"%} + ins_encode %{ + __ sdiv($temp$$Register, $src1$$Register, $src2$$Register); + __ mls($dst$$Register, $temp$$Register, $src2$$Register, $src1$$Register); + %} + ins_pipe(sdiv_reg_reg_IDIV); +%} + +instruct modI_reg_reg_SW(R0RegI dst, R1RegI src1, R2RegI src2, R9RegI temp1, R12RegI temp2, LRRegP lr, flagsReg ccr ) %{ + match(Set dst (ModI src1 src2)); + predicate(!(VM_Version::features() & FT_HW_DIVIDE)); + effect( KILL ccr, TEMP temp1, TEMP temp2, KILL lr, USE_KILL src1, USE_KILL src2); + + format %{ "MODI $dst,$src1,$src2\t ! call to StubRoutines::aarch32::irem_entry" %} + ins_encode %{ + __ call(StubRoutines::aarch32::irem_entry(), relocInfo::runtime_call_type); + %} + ins_pipe(sdiv_reg_reg_SW); +%} + +// Register Long Remainder +instruct modL_reg_reg(R0R1RegL dst, R2R3RegL src1, R0R1RegL src2) %{ + match(Set dst (ModL src1 src2)); + effect(CALL); + ins_cost(MEMORY_REF_COST); // FIXME + format %{ "modL $dst,$src1,$src2\t ! call to SharedRuntime::lrem" %} + ins_encode %{ + address target = CAST_FROM_FN_PTR(address, SharedRuntime::lrem); + __ call(target, relocInfo::runtime_call_type); + %} + ins_pipe(divL_reg_reg); +%} + +// Integer Shift Instructions + +// Register Shift Left +instruct shlI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (LShiftI src1 src2)); + + size(4); + format %{ "LSL $dst,$src1,$src2 \n\t" %} + ins_encode %{ + __ mov($dst$$Register, $src1$$Register, lsl($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Register Shift Left Immediate +instruct shlI_reg_imm5(iRegI dst, iRegI src1, immU5 src2) %{ + match(Set dst (LShiftI src1 src2)); + + size(4); + format %{ "LSL $dst,$src1,$src2\t! 
int" %} + ins_encode %{ + __ lsl($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +instruct shlL_reg_reg_merge_hi(iRegL dst, iRegL src1, iRegI src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + size(4); + format %{"OR $dst.hi,$dst.hi,($src1.hi << $src2)" %} + ins_encode %{ + __ orr($dst$$Register->successor(), $dst$$Register->successor(), $src1$$Register->successor(), lsl($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct shlL_reg_reg_merge_lo(iRegL dst, iRegL src1, iRegI src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + size(4); + format %{ "LSL $dst.lo,$src1.lo,$src2 \n\t" %} + ins_encode %{ + __ mov($dst$$Register, $src1$$Register, lsl($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct shlL_reg_reg_overlap(iRegL dst, iRegL src1, iRegI src2, flagsReg ccr) %{ + effect(DEF dst, USE src1, USE src2, KILL ccr); + size(16); + format %{ "SUBS $dst.hi,$src2,32 \n\t" + "LSLpl $dst.hi,$src1.lo,$dst.hi \n\t" + "RSBmi $dst.hi,$dst.hi,0 \n\t" + "LSRmi $dst.hi,$src1.lo,$dst.hi" %} + + ins_encode %{ + // $src1$$Register and $dst$$Register->successor() can't be the same + __ subs($dst$$Register->successor(), $src2$$Register, 32); + __ mov($dst$$Register->successor(), $src1$$Register, lsl($dst$$Register->successor()), Assembler::PL); + __ rsb($dst$$Register->successor(), $dst$$Register->successor(), 0, Assembler::MI); + __ mov($dst$$Register->successor(), $src1$$Register, lsr($dst$$Register->successor()), Assembler::MI); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct shlL_reg_reg(iRegL dst, iRegL src1, iRegI src2) %{ + match(Set dst (LShiftL src1 src2)); + + expand %{ + flagsReg ccr; + shlL_reg_reg_overlap(dst, src1, src2, ccr); + shlL_reg_reg_merge_hi(dst, src1, src2); + shlL_reg_reg_merge_lo(dst, src1, src2); + %} +%} + +// Register Shift Left Immediate +instruct shlL_reg_imm6(iRegL dst, iRegL src1, immU6Big src2) %{ + match(Set dst (LShiftL src1 src2)); + + size(8); + format %{ "LSL $dst.hi,$src1.lo,$src2-32\t! or mov if $src2==32\n\t" + "MOV $dst.lo, 0" %} + ins_encode %{ + if ($src2$$constant == 32) { + __ mov($dst$$Register->successor(), $src1$$Register); + } else { + __ mov($dst$$Register->successor(), $src1$$Register, lsl($src2$$constant-32)); + } + __ mov($dst$$Register, 0); + %} + ins_pipe(ialu_reg_imm); +%} + +instruct shlL_reg_imm5(iRegL dst, iRegL src1, immU5 src2) %{ + match(Set dst (LShiftL src1 src2)); + + size(12); + format %{ "LSL $dst.hi,$src1.lo,$src2\n\t" + "OR $dst.hi, $dst.hi, $src1.lo >> 32-$src2\n\t" + "LSL $dst.lo,$src1.lo,$src2" %} + ins_encode %{ + // The order of the following 3 instructions matters: src1.lo and + // dst.hi can't overlap but src.hi and dst.hi can. + __ mov($dst$$Register->successor(), $src1$$Register->successor(), lsl($src2$$constant)); + __ orr($dst$$Register->successor(), $dst$$Register->successor(), $src1$$Register, lsr(32-$src2$$constant)); + __ mov($dst$$Register, $src1$$Register, lsl($src2$$constant)); + %} + ins_pipe(ialu_reg_imm); +%} + +// Register Arithmetic Shift Right +instruct sarI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (RShiftI src1 src2)); + size(4); + format %{ "ASR $dst,$src1,$src2\t! 
int" %} + ins_encode %{ + __ mov($dst$$Register, $src1$$Register, asr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Register Arithmetic Shift Right Immediate +instruct sarI_reg_imm5(iRegI dst, iRegI src1, immU5 src2) %{ + match(Set dst (RShiftI src1 src2)); + + size(4); + format %{ "ASR $dst,$src1,$src2" %} + ins_encode %{ + __ mov($dst$$Register, $src1$$Register, asr($src2$$constant)); + %} + ins_pipe(ialu_reg_imm); +%} + +// Register Shift Right Arithmetic Long +instruct sarL_reg_reg_merge_lo(iRegL dst, iRegL src1, iRegI src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + size(4); + format %{ "OR $dst.lo,$dst.lo,($src1.lo >> $src2)" %} + ins_encode %{ + __ orr($dst$$Register, $dst$$Register, $src1$$Register, lsr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct sarL_reg_reg_merge_hi(iRegL dst, iRegL src1, iRegI src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + size(4); + format %{ "ASR $dst.hi,$src1.hi,$src2 \n\t" %} + ins_encode %{ + __ mov($dst$$Register->successor(), $src1$$Register->successor(), asr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct sarL_reg_reg_overlap(iRegL dst, iRegL src1, iRegI src2, flagsReg ccr) %{ + effect(DEF dst, USE src1, USE src2, KILL ccr); + size(16); + format %{ "SUBS $dst.lo,$src2,32 \n\t" + "ASRpl $dst.lo,$src1.hi,$dst.lo \n\t" + "RSBmi $dst.lo,$dst.lo,0 \n\t" + "LSLmi $dst.lo,$src1.hi,$dst.lo" %} + + ins_encode %{ + // $src1$$Register->successor() and $dst$$Register can't be the same + __ subs($dst$$Register, $src2$$Register, 32); + __ mov($dst$$Register, $src1$$Register->successor(), asr($dst$$Register), Assembler::PL); + __ rsb($dst$$Register, $dst$$Register, 0, Assembler::MI); + __ mov($dst$$Register, $src1$$Register->successor(), lsl($dst$$Register), Assembler::MI); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct sarL_reg_reg(iRegL dst, iRegL src1, iRegI src2) %{ + match(Set dst (RShiftL src1 src2)); + + expand %{ + flagsReg ccr; + sarL_reg_reg_overlap(dst, src1, src2, ccr); + sarL_reg_reg_merge_lo(dst, src1, src2); + sarL_reg_reg_merge_hi(dst, src1, src2); + %} +%} + +// Register Shift Left Immediate +instruct sarL_reg_imm6(iRegL dst, iRegL src1, immU6Big src2) %{ + match(Set dst (RShiftL src1 src2)); + + size(8); + format %{ "ASR $dst.lo,$src1.hi,$src2-32\t! or mov if $src2==32\n\t" + "ASR $dst.hi,$src1.hi, $src2" %} + ins_encode %{ + if ($src2$$constant == 32) { + __ mov($dst$$Register, $src1$$Register->successor()); + } else{ + __ mov($dst$$Register, $src1$$Register->successor(), asr($src2$$constant-32)); + } + __ mov($dst$$Register->successor(), $src1$$Register->successor(), asr(32)); + %} + + ins_pipe(ialu_reg_imm); +%} + +instruct sarL_reg_imm5(iRegL dst, iRegL src1, immU5 src2) %{ + match(Set dst (RShiftL src1 src2)); + size(12); + format %{ "LSR $dst.lo,$src1.lo,$src2\n\t" + "OR $dst.lo, $dst.lo, $src1.hi << 32-$src2\n\t" + "ASR $dst.hi,$src1.hi,$src2" %} + ins_encode %{ + // The order of the following 3 instructions matters: src1.lo and + // dst.hi can't overlap but src.hi and dst.hi can. + __ mov($dst$$Register, $src1$$Register, lsr($src2$$constant)); + __ orr($dst$$Register, $dst$$Register, $src1$$Register->successor(), lsl(32-$src2$$constant)); + __ mov($dst$$Register->successor(), $src1$$Register->successor(), asr($src2$$constant)); + %} + ins_pipe(ialu_reg_imm); +%} + +// Register Shift Right +instruct shrI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (URShiftI src1 src2)); + size(4); + format %{ "LSR $dst,$src1,$src2\t! 
int" %} + ins_encode %{ + __ mov($dst$$Register, $src1$$Register, lsr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Register Shift Right Immediate +instruct shrI_reg_imm5(iRegI dst, iRegI src1, immU5 src2) %{ + match(Set dst (URShiftI src1 src2)); + + size(4); + format %{ "LSR $dst,$src1,$src2" %} + ins_encode %{ + __ mov($dst$$Register, $src1$$Register, lsr($src2$$constant)); + %} + ins_pipe(ialu_reg_imm); +%} + +// Register Shift Right +instruct shrL_reg_reg_merge_lo(iRegL dst, iRegL src1, iRegI src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + size(4); + format %{ "OR $dst.lo,$dst,($src1.lo >>> $src2)" %} + ins_encode %{ + __ orr($dst$$Register, $dst$$Register, $src1$$Register, lsr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct shrL_reg_reg_merge_hi(iRegL dst, iRegL src1, iRegI src2) %{ + effect(USE_DEF dst, USE src1, USE src2); + size(4); + format %{ "LSR $dst.hi,$src1.hi,$src2 \n\t" %} + ins_encode %{ + __ mov($dst$$Register->successor(), $src1$$Register->successor(), lsr($src2$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct shrL_reg_reg_overlap(iRegL dst, iRegL src1, iRegI src2, flagsReg ccr) %{ + effect(DEF dst, USE src1, USE src2, KILL ccr); + size(16); + format %{ "SUBS $dst,$src2,32 \n\t" + "LSRpl $dst,$src1.hi,$dst \n\t" + "RSBmi $dst,$dst,0 \n\t" + "LSLmi $dst,$src1.hi,$dst" %} + + ins_encode %{ + // $src1$$Register->successor() and $dst$$Register can't be the same + __ subs($dst$$Register, $src2$$Register, 32); + __ mov($dst$$Register, $src1$$Register->successor(), lsr($dst$$Register), Assembler::PL); + __ rsb($dst$$Register, $dst$$Register, 0, Assembler::MI); + __ mov($dst$$Register, $src1$$Register->successor(), lsl($dst$$Register), Assembler::MI); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct shrL_reg_reg(iRegL dst, iRegL src1, iRegI src2) %{ + match(Set dst (URShiftL src1 src2)); + + expand %{ + flagsReg ccr; + shrL_reg_reg_overlap(dst, src1, src2, ccr); + shrL_reg_reg_merge_lo(dst, src1, src2); + shrL_reg_reg_merge_hi(dst, src1, src2); + %} +%} + +// Register Shift Right Immediate +instruct shrL_reg_imm6(iRegL dst, iRegL src1, immU6Big src2) %{ + match(Set dst (URShiftL src1 src2)); + + size(8); + format %{ "LSR $dst.lo,$src1.hi,$src2-32\t! or mov if $src2==32\n\t" + "MOV $dst.hi, 0" %} + ins_encode %{ + if ($src2$$constant == 32) { + __ mov($dst$$Register, $src1$$Register->successor()); + } else { + __ mov($dst$$Register, $src1$$Register->successor(), lsr($src2$$constant-32)); + } + __ mov($dst$$Register->successor(), 0); + %} + + ins_pipe(ialu_reg_imm); +%} + +instruct shrL_reg_imm5(iRegL dst, iRegL src1, immU5 src2) %{ + match(Set dst (URShiftL src1 src2)); + + size(12); + format %{ "LSR $dst.lo,$src1.lo,$src2\n\t" + "OR $dst.lo, $dst.lo, $src1.hi << 32-$src2\n\t" + "LSR $dst.hi,$src1.hi,$src2" %} + ins_encode %{ + // The order of the following 3 instructions matters: src1.lo and + // dst.hi can't overlap but src.hi and dst.hi can. + __ mov($dst$$Register, $src1$$Register, lsr($src2$$constant)); + __ orr($dst$$Register, $dst$$Register, $src1$$Register->successor(), lsl(32-$src2$$constant)); + __ mov($dst$$Register->successor(), $src1$$Register->successor(), lsr($src2$$constant)); + %} + ins_pipe(ialu_reg_imm); +%} + + +instruct shrP_reg_imm5(iRegX dst, iRegP src1, immU5 src2) %{ + match(Set dst (URShiftI (CastP2X src1) src2)); + size(4); + format %{ "LSR $dst,$src1,$src2\t! 
Cast ptr $src1 to int and shift" %} + ins_encode %{ + __ lsr($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +// Overcomplicated unsigned math +instruct umull_lreg32_lreg32(iRegL dst, iRegL src1, iRegL src2) %{ + match(Set dst (MulL src1 src2)); + predicate(n->in(1)->Opcode() == Op_AndL && (((unsigned long long)n->in(1)->in(2)->find_long_con(-1))>>32)==0 && + n->in(2)->Opcode() == Op_AndL && (((unsigned long long)n->in(2)->in(2)->find_long_con(-1))>>32)==0); + + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "UMULL $dst.lo,$dst.hi,$src1.lo,$src2.lo" %} + ins_encode %{ + __ umull($dst$$Register, $dst$$Register->successor(), $src1$$Register, $src2$$Register); + %} + ins_pipe(imul_reg_reg); +%} + +instruct umlal_reg32_reg32(iRegL dst, iRegL src1, iRegL src2) %{ + match(Set dst (AddL dst (MulL src1 src2))); + predicate( + n->in(2)->Opcode() == Op_MulL ? + n->in(2)->in(1)->Opcode() == Op_AndL && (((unsigned long long)n->in(2)->in(1)->in(2)->find_long_con(-1))>>32)==0 && + n->in(2)->in(2)->Opcode() == Op_AndL && (((unsigned long long)n->in(2)->in(2)->in(2)->find_long_con(-1))>>32)==0 : + n->in(1)->in(1)->Opcode() == Op_AndL && (((unsigned long long)n->in(1)->in(1)->in(2)->find_long_con(-1))>>32)==0 && + n->in(1)->in(2)->Opcode() == Op_AndL && (((unsigned long long)n->in(1)->in(2)->in(2)->find_long_con(-1))>>32)==0 + ); + + ins_cost(DEFAULT_COST*3/2); + size(4); + format %{ "UMLAL $dst.lo,$dst.hi,$src1.lo,$src2.lo" %} + ins_encode %{ + __ umlal($dst$$Register, $dst$$Register->successor(), $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +//----------Floating Point Arithmetic Instructions----------------------------- + +// Add float single precision +instruct addF_reg_reg(regF dst, regF src1, regF src2) %{ + match(Set dst (AddF src1 src2)); + + size(4); + format %{ "FADDS $dst,$src1,$src2" %} + ins_encode %{ + __ vadd_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + + ins_pipe(faddF_reg_reg); +%} + +// Add float double precision +instruct addD_reg_reg(regD dst, regD src1, regD src2) %{ + match(Set dst (AddD src1 src2)); + + size(4); + format %{ "FADDD $dst,$src1,$src2" %} + ins_encode %{ + __ vadd_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + + ins_pipe(faddD_reg_reg); +%} + +// Sub float single precision +instruct subF_reg_reg(regF dst, regF src1, regF src2) %{ + match(Set dst (SubF src1 src2)); + + size(4); + format %{ "FSUBS $dst,$src1,$src2" %} + ins_encode %{ + __ vsub_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(faddF_reg_reg); +%} + +// Sub float double precision +instruct subD_reg_reg(regD dst, regD src1, regD src2) %{ + match(Set dst (SubD src1 src2)); + + size(4); + format %{ "FSUBD $dst,$src1,$src2" %} + ins_encode %{ + __ vsub_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(faddD_reg_reg); +%} + +// Mul float single precision +instruct mulF_reg_reg(regF dst, regF src1, regF src2) %{ + match(Set dst (MulF src1 src2)); + + size(4); + format %{ "FMULS $dst,$src1,$src2" %} + ins_encode %{ + __ vmul_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + + ins_pipe(fmulF_reg_reg); +%} + +// Mul float double precision +instruct mulD_reg_reg(regD dst, regD src1, regD src2) %{ + match(Set dst (MulD src1 src2)); + + size(4); + format %{ "FMULD $dst,$src1,$src2" %} + ins_encode %{ + __ vmul_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + + 
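// --------------------------------------------------------------------------
// [Editorial aside, not part of the patch] The umull/umlal rules earlier in
// this hunk only match a MulL (or an AddL of a MulL) whose inputs are AndL
// nodes with a constant mask whose upper 32 bits are all zero, i.e. operands
// that are provably zero-extended 32-bit values; only then does one
// UMULL/UMLAL yield the exact 64-bit product. A minimal C++ sketch of that
// mask test follows; the helper name and the reliance on find_long_con's -1
// fallback for non-constant masks are illustrative assumptions.

#include <cstdint>

// True when an AndL mask clears bits 32..63, so (x & mask) fits in 32 bits
// and the long multiply can be lowered to a single 32x32->64 unsigned
// multiply. A non-constant mask reported as -1 (all bits set) fails the test.
static bool zero_extends_to_32_bits(uint64_t mask) {
  return (mask >> 32) == 0;
}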
ins_pipe(fmulD_reg_reg); +%} + +// Div float single precision +instruct divF_reg_reg(regF dst, regF src1, regF src2) %{ + match(Set dst (DivF src1 src2)); + + size(4); + format %{ "FDIVS $dst,$src1,$src2" %} + ins_encode %{ + __ vdiv_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + + ins_pipe(fdivF_reg_reg); +%} + +// Div float double precision +instruct divD_reg_reg(regD dst, regD src1, regD src2) %{ + match(Set dst (DivD src1 src2)); + + size(4); + format %{ "FDIVD $dst,$src1,$src2" %} + ins_encode %{ + __ vdiv_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + + ins_pipe(fdivD_reg_reg); +%} + +// Absolute float double precision +instruct absD_reg(regD dst, regD src) %{ + match(Set dst (AbsD src)); + + size(4); + format %{ "FABSd $dst,$src" %} + ins_encode %{ + __ vabs_f64($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(faddD_reg); +%} + +// Absolute float single precision +instruct absF_reg(regF dst, regF src) %{ + match(Set dst (AbsF src)); + format %{ "FABSs $dst,$src" %} + ins_encode %{ + __ vabs_f32($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(faddF_reg); +%} + +instruct negF_reg(regF dst, regF src) %{ + match(Set dst (NegF src)); + + size(4); + format %{ "FNEGs $dst,$src" %} + ins_encode %{ + __ vneg_f32($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(faddF_reg); +%} + +instruct negD_reg(regD dst, regD src) %{ + match(Set dst (NegD src)); + + format %{ "FNEGd $dst,$src" %} + ins_encode %{ + __ vneg_f64($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(faddD_reg); +%} + +// Sqrt float double precision +instruct sqrtF_reg_reg(regF dst, regF src) %{ + match(Set dst (ConvD2F (SqrtD (ConvF2D src)))); + + size(4); + format %{ "FSQRTS $dst,$src" %} + ins_encode %{ + __ vsqrt_f32($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(fdivF_reg_reg); +%} + +// Sqrt float double precision +instruct sqrtD_reg_reg(regD dst, regD src) %{ + match(Set dst (SqrtD src)); + + size(4); + format %{ "FSQRTD $dst,$src" %} + ins_encode %{ + __ vsqrt_f64($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(fdivD_reg_reg); +%} + +//----------Logical Instructions----------------------------------------------- +// And Instructions +// Register And +instruct andI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (AndI src1 src2)); + + size(4); + format %{ "and_32 $dst,$src1,$src2" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct andshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (AndI src1 (LShiftI src2 src3))); + + size(4); + format %{ "AND $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct andshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (AndI src1 (LShiftI src2 src3))); + + size(4); + format %{ "and_32 $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct andsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (AndI src1 (RShiftI src2 src3))); + + size(4); + format %{ "AND $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct andsarI_reg_reg_imm(iRegI 
dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (AndI src1 (RShiftI src2 src3))); + + size(4); + format %{ "and_32 $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct andshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (AndI src1 (URShiftI src2 src3))); + + size(4); + format %{ "AND $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct andshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (AndI src1 (URShiftI src2 src3))); + + size(4); + format %{ "and_32 $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Immediate And +instruct andI_reg_limm(iRegI dst, iRegI src1, limmI src2) %{ + match(Set dst (AndI src1 src2)); + + size(4); + format %{ "and_32 $dst,$src1,$src2\t! int" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +instruct andI_reg_limmn(iRegI dst, iRegI src1, limmIn src2) %{ + match(Set dst (AndI src1 src2)); + + size(4); + format %{ "bic $dst,$src1,~$src2\t! int" %} + ins_encode %{ + __ bic($dst$$Register, $src1$$Register, ~$src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +// Register And Long +instruct andL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ + match(Set dst (AndL src1 src2)); + + ins_cost(DEFAULT_COST); + size(8); + format %{ "AND $dst,$src1,$src2\t! long" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $src2$$Register); + __ andr($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct andL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con) %{ + match(Set dst (AndL src1 con)); + ins_cost(DEFAULT_COST); + size(8); + format %{ "AND $dst,$src1,$con\t! long" %} + ins_encode %{ + __ andr($dst$$Register, $src1$$Register, $con$$constant); + __ andr($dst$$Register->successor(), $src1$$Register->successor(), 0u); + %} + ins_pipe(ialu_reg_imm); +%} + +// Or Instructions +// Register Or +instruct orI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (OrI src1 src2)); + + size(4); + format %{ "orr_32 $dst,$src1,$src2\t! 
int" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct orshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (OrI src1 (LShiftI src2 src3))); + + size(4); + format %{ "OR $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct orshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (OrI src1 (LShiftI src2 src3))); + + size(4); + format %{ "orr_32 $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct orsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (OrI src1 (RShiftI src2 src3))); + + size(4); + format %{ "OR $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct orsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (OrI src1 (RShiftI src2 src3))); + + size(4); + format %{ "orr_32 $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct orshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (OrI src1 (URShiftI src2 src3))); + + size(4); + format %{ "OR $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct orshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (OrI src1 (URShiftI src2 src3))); + + size(4); + format %{ "orr_32 $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Immediate Or +instruct orI_reg_limm(iRegI dst, iRegI src1, limmI src2) %{ + match(Set dst (OrI src1 src2)); + + size(4); + format %{ "orr_32 $dst,$src1,$src2" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} +// TODO: orn_32 with limmIn + +// Register Or Long +instruct orL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ + match(Set dst (OrL src1 src2)); + + ins_cost(DEFAULT_COST); + size(8); + format %{ "OR $dst.lo,$src1.lo,$src2.lo\t! long\n\t" + "OR $dst.hi,$src1.hi,$src2.hi" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register); + __ orr($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct orL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con) %{ + match(Set dst (OrL src1 con)); + ins_cost(DEFAULT_COST); + size(8); + format %{ "OR $dst.lo,$src1.lo,$con\t! long\n\t" + "OR $dst.hi,$src1.hi,$con" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $con$$constant); + __ orr($dst$$Register->successor(), $src1$$Register->successor(), 0u); + %} + ins_pipe(ialu_reg_imm); +%} + +#ifdef TODO +// Use SPRegP to match Rthread (TLS register) without spilling. +// Use store_ptr_RegP to match Rthread (TLS register) without spilling. 
+// Use sp_ptr_RegP to match Rthread (TLS register) without spilling. +instruct orI_reg_castP2X(iRegI dst, iRegI src1, sp_ptr_RegP src2) %{ + match(Set dst (OrI src1 (CastP2X src2))); + size(4); + format %{ "OR $dst,$src1,$src2" %} + ins_encode %{ + __ orr($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} +#endif + +// Xor Instructions +// Register Xor +instruct xorI_reg_reg(iRegI dst, iRegI src1, iRegI src2) %{ + match(Set dst (XorI src1 src2)); + + size(4); + format %{ "eor_32 $dst,$src1,$src2" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct xorshlI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (XorI src1 (LShiftI src2 src3))); + + size(4); + format %{ "XOR $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct xorshlI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (XorI src1 (LShiftI src2 src3))); + + size(4); + format %{ "eor_32 $dst,$src1,$src2<<$src3" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register, lsl($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct xorsarI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (XorI src1 (RShiftI src2 src3))); + + size(4); + format %{ "XOR $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct xorsarI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (XorI src1 (RShiftI src2 src3))); + + size(4); + format %{ "eor_32 $dst,$src1,$src2>>$src3" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register, asr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct xorshrI_reg_reg_reg(iRegI dst, iRegI src1, iRegI src2, iRegI src3) %{ + match(Set dst (XorI src1 (URShiftI src2 src3))); + + size(4); + format %{ "XOR $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$Register)); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct xorshrI_reg_reg_imm(iRegI dst, iRegI src1, iRegI src2, immU5 src3) %{ + match(Set dst (XorI src1 (URShiftI src2 src3))); + + size(4); + format %{ "eor_32 $dst,$src1,$src2>>>$src3" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register, lsr($src3$$constant)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Immediate Xor +instruct xorI_reg_imm(iRegI dst, iRegI src1, limmI src2) %{ + match(Set dst (XorI src1 src2)); + + size(4); + format %{ "eor_32 $dst,$src1,$src2" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$constant); + %} + ins_pipe(ialu_reg_imm); +%} + +// Register Xor Long +instruct xorL_reg_reg(iRegL dst, iRegL src1, iRegL src2) %{ + match(Set dst (XorL src1 src2)); + ins_cost(DEFAULT_COST); + size(8); + format %{ "XOR $dst.hi,$src1.hi,$src2.hi\t! long\n\t" + "XOR $dst.lo,$src1.lo,$src2.lo\t! 
long" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $src2$$Register); + __ eor($dst$$Register->successor(), $src1$$Register->successor(), $src2$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct xorL_reg_immRot(iRegL dst, iRegL src1, immLlowRot con) %{ + match(Set dst (XorL src1 con)); + ins_cost(DEFAULT_COST); + size(8); + format %{ "XOR $dst.hi,$src1.hi,$con\t! long\n\t" + "XOR $dst.lo,$src1.lo,0\t! long" %} + ins_encode %{ + __ eor($dst$$Register, $src1$$Register, $con$$constant); + __ eor($dst$$Register->successor(), $src1$$Register->successor(), 0u); + %} + ins_pipe(ialu_reg_imm); +%} + +//----------Convert to Boolean------------------------------------------------- +instruct convI2B( iRegI dst, iRegI src, flagsReg ccr ) %{ + match(Set dst (Conv2B src)); + effect(KILL ccr); + size(12); + ins_cost(DEFAULT_COST*2); + format %{ "TST $src,$src \n\t" + "MOV $dst, 0 \n\t" + "MOV.ne $dst, 1" %} + ins_encode %{ // FIXME: can do better? + __ tst($src$$Register, $src$$Register); + __ mov($dst$$Register, 0); + __ mov($dst$$Register, 1, Assembler::NE); + %} + ins_pipe(ialu_reg_ialu); +%} + +instruct convP2B( iRegI dst, iRegP src, flagsReg ccr ) %{ + match(Set dst (Conv2B src)); + effect(KILL ccr); + size(12); + ins_cost(DEFAULT_COST*2); + format %{ "TST $src,$src \n\t" + "MOV $dst, 0 \n\t" + "MOV.ne $dst, 1" %} + ins_encode %{ + __ tst($src$$Register, $src$$Register); + __ mov($dst$$Register, 0); + __ mov($dst$$Register, 1, Assembler::NE); + %} + ins_pipe(ialu_reg_ialu); +%} + +instruct cmpLTMask_reg_reg( iRegI dst, iRegI p, iRegI q, flagsReg ccr ) %{ + match(Set dst (CmpLTMask p q)); + effect( KILL ccr ); + ins_cost(DEFAULT_COST*3); + format %{ "CMP $p,$q\n\t" + "MOV $dst, #0\n\t" + "MOV.lt $dst, #-1" %} + ins_encode %{ + __ cmp($p$$Register, $q$$Register); + __ mov_i($dst$$Register, 0); + __ mvn_i($dst$$Register, 0, Assembler::LT); + %} + ins_pipe(ialu_reg_reg_ialu); +%} + +instruct cmpLTMask_reg_imm( iRegI dst, iRegI p, aimmI q, flagsReg ccr ) %{ + match(Set dst (CmpLTMask p q)); + effect( KILL ccr ); + ins_cost(DEFAULT_COST*3); + format %{ "CMP $p,$q\n\t" + "MOV $dst, #0\n\t" + "MOV.lt $dst, #-1" %} + ins_encode %{ + __ cmp($p$$Register, $q$$constant); + __ mov_i($dst$$Register, 0); + __ mvn_i($dst$$Register, 0, Assembler::LT); + %} + ins_pipe(ialu_reg_reg_ialu); +%} + +instruct cadd_cmpLTMask3( iRegI p, iRegI q, iRegI y, iRegI z, flagsReg ccr ) %{ + match(Set z (AddI (AndI (CmpLTMask p q) y) z)); + effect( KILL ccr ); + ins_cost(DEFAULT_COST*2); + format %{ "CMP $p,$q\n\t" + "ADD.lt $z,$y,$z" %} + ins_encode %{ + __ cmp($p$$Register, $q$$Register); + __ add($z$$Register, $y$$Register, $z$$Register, Assembler::LT); + %} + ins_pipe( cadd_cmpltmask ); +%} + +// FIXME: remove unused "dst" +instruct cadd_cmpLTMask4( iRegI dst, iRegI p, aimmI q, iRegI y, iRegI z, flagsReg ccr ) %{ + match(Set z (AddI (AndI (CmpLTMask p q) y) z)); + effect( KILL ccr ); + ins_cost(DEFAULT_COST*2); + format %{ "CMP $p,$q\n\t" + "ADD.lt $z,$y,$z" %} + ins_encode %{ + __ cmp($p$$Register, $q$$constant); + __ add($z$$Register, $y$$Register, $z$$Register, Assembler::LT); + %} + ins_pipe( cadd_cmpltmask ); +%} + +instruct cadd_cmpLTMask( iRegI p, iRegI q, iRegI y, flagsReg ccr ) %{ + match(Set p (AddI (AndI (CmpLTMask p q) y) (SubI p q))); + effect( KILL ccr ); + ins_cost(DEFAULT_COST*2); + format %{ "SUBS $p,$p,$q\n\t" + "ADD.lt $p,$y,$p" %} + ins_encode %{ + __ 
subs($p$$Register, $p$$Register, $q$$Register); + __ add($p$$Register, $y$$Register, $p$$Register, Assembler::LT); + %} + ins_pipe( cadd_cmpltmask ); +%} + +//----------Arithmetic Conversion Instructions--------------------------------- +// The conversions operations are all Alpha sorted. Please keep it that way! + +instruct convD2F_reg(regF dst, regD src) %{ + match(Set dst (ConvD2F src)); + size(4); + format %{ "FCVTSD $dst,$src" %} + ins_encode %{ + __ vcvt_f32_f64($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(fcvtD2F); +%} + +// Convert a double to an int in a float register. +// If the double is a NAN, stuff a zero in instead. + +instruct convD2I_reg_reg(iRegI dst, regD src, regF tmp) %{ + match(Set dst (ConvD2I src)); + effect( TEMP tmp ); + ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME + format %{ "FTOSIZD $tmp,$src\n\t" + "FMRS $dst, $tmp" %} + ins_encode %{ + __ vcvt_s32_f64($tmp$$FloatRegister, $src$$FloatRegister); + __ vmov_f32($dst$$Register, $tmp$$FloatRegister); + %} + ins_pipe(fcvtD2I); +%} + +// Convert a double to a long in a double register. +// If the double is a NAN, stuff a zero in instead. + +// Double to Long conversion +instruct convD2L_reg(R0R1RegL dst, regD src) %{ + match(Set dst (ConvD2L src)); + effect(CALL); + ins_cost(MEMORY_REF_COST); // FIXME + format %{ "convD2L $dst,$src\t ! call to SharedRuntime::d2l" %} + ins_encode %{ +#ifndef HARD_FLOAT_CC + __ vmov_f64($dst$$Register, $dst$$Register->successor(), $src$$FloatRegister); +#else + if ($src$$FloatRegister != d0) { + __ vmov_f64(d0, $src$$FloatRegister); + } +#endif + address target = CAST_FROM_FN_PTR(address, SharedRuntime::d2l); + __ call(target, relocInfo::runtime_call_type); + %} + ins_pipe(fcvtD2L); +%} + +instruct convF2D_reg(regD dst, regF src) %{ + match(Set dst (ConvF2D src)); + size(4); + format %{ "FCVTDS $dst,$src" %} + ins_encode %{ + __ vcvt_f64_f32($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(fcvtF2D); +%} + +instruct convF2I_reg_reg(iRegI dst, regF src, regF tmp) %{ + match(Set dst (ConvF2I src)); + effect( TEMP tmp ); + ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME + size(8); + format %{ "FTOSIZS $tmp,$src\n\t" + "FMRS $dst, $tmp" %} + ins_encode %{ + __ vcvt_s32_f32($tmp$$FloatRegister, $src$$FloatRegister); + __ vmov_f32($dst$$Register, $tmp$$FloatRegister); + %} + ins_pipe(fcvtF2I); +%} + +// Float to Long conversion +instruct convF2L_reg(R0R1RegL dst, regF src, R0RegI arg1) %{ + match(Set dst (ConvF2L src)); + ins_cost(DEFAULT_COST*2 + MEMORY_REF_COST*2 + BRANCH_COST); // FIXME + effect(CALL); + format %{ "convF2L $dst,$src\t! 
call to SharedRuntime::f2l" %} + ins_encode %{ +#ifndef HARD_FLOAT_CC + __ vmov_f32($arg1$$Register, $src$$FloatRegister); +#else + if($src$$FloatRegister != f0) { + __ vmov_f32(f0, $src$$FloatRegister); + } +#endif + address target = CAST_FROM_FN_PTR(address, SharedRuntime::f2l); + __ call(target, relocInfo::runtime_call_type); + %} + ins_pipe(fcvtF2L); +%} + +instruct convI2D_reg_reg(iRegI src, regD_low dst) %{ + match(Set dst (ConvI2D src)); + ins_cost(DEFAULT_COST + MEMORY_REF_COST); // FIXME + size(8); + format %{ "FMSR $dst,$src \n\t" + "FSITOD $dst $dst"%} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$Register); + __ vcvt_f64_s32($dst$$FloatRegister, $dst$$FloatRegister); + %} + ins_pipe(fcvtI2D); +%} + +instruct convI2F_reg_reg( regF dst, iRegI src ) %{ + match(Set dst (ConvI2F src)); + ins_cost(DEFAULT_COST + MEMORY_REF_COST); // FIXME + size(8); + format %{ "FMSR $dst,$src \n\t" + "FSITOS $dst, $dst"%} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$Register); + __ vcvt_f32_s32($dst$$FloatRegister, $dst$$FloatRegister); + %} + ins_pipe(fcvtI2F); +%} + +instruct convI2L_reg(iRegL dst, iRegI src) %{ + match(Set dst (ConvI2L src)); + size(8); + format %{ "MOV $dst.lo, $src \n\t" + "ASR $dst.hi,$src,31\t! int->long" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register); + __ mov($dst$$Register->successor(), $src$$Register, asr(31)); + %} + ins_pipe(ialu_reg_reg); +%} + +// Zero-extend convert int to long +instruct convI2L_reg_zex(iRegL dst, iRegI src, immL_32bits mask ) %{ + match(Set dst (AndL (ConvI2L src) mask) ); + size(8); + format %{ "MOV $dst.lo,$src.lo\t! zero-extend int to long\n\t" + "MOV $dst.hi, 0"%} + ins_encode %{ + __ mov($dst$$Register, $src$$Register); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(ialu_reg_reg); +%} + +// Zero-extend long +instruct zerox_long(iRegL dst, iRegL src, immL_32bits mask ) %{ + match(Set dst (AndL src mask) ); + size(8); + format %{ "MOV $dst.lo,$src.lo\t! zero-extend long\n\t" + "MOV $dst.hi, 0"%} + ins_encode %{ + __ mov($dst$$Register, $src$$Register); + __ mov($dst$$Register->successor(), 0); + %} + ins_pipe(ialu_reg_reg); +%} + +instruct MoveF2I_reg_reg(iRegI dst, regF src) %{ + match(Set dst (MoveF2I src)); + effect(DEF dst, USE src); + ins_cost(MEMORY_REF_COST); // FIXME + + size(4); + format %{ "FMRS $dst,$src\t! MoveF2I" %} + ins_encode %{ + __ vmov_f32($dst$$Register, $src$$FloatRegister); + %} + ins_pipe(iload_mem); // FIXME +%} + +instruct MoveI2F_reg_reg(regF dst, iRegI src) %{ + match(Set dst (MoveI2F src)); + ins_cost(MEMORY_REF_COST); // FIXME + + size(4); + format %{ "FMSR $dst,$src\t! MoveI2F" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(iload_mem); // FIXME +%} + +instruct MoveD2L_reg_reg(iRegL dst, regD src) %{ + match(Set dst (MoveD2L src)); + effect(DEF dst, USE src); + ins_cost(MEMORY_REF_COST); // FIXME + + size(4); + format %{ "FMRRD $dst,$src\t! MoveD2L" %} + ins_encode %{ + __ vmov_f64($dst$$Register, $dst$$Register->successor(), $src$$FloatRegister); + %} + ins_pipe(iload_mem); // FIXME +%} + +instruct MoveL2D_reg_reg(regD dst, iRegL src) %{ + match(Set dst (MoveL2D src)); + effect(DEF dst, USE src); + ins_cost(MEMORY_REF_COST); // FIXME + + size(4); + format %{ "FMDRR $dst,$src\t! 
MoveL2D" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$Register, $src$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); // FIXME +%} + +//----------- +// Long to Double conversion + +// Magic constant, 0x43300000 +instruct loadConI_x43300000(iRegI dst) %{ + effect(DEF dst); + size(8); + format %{ "MOV_SLOW $dst,0x43300000\t! 2^52" %} + ins_encode %{ + __ mov($dst$$Register, 0x43300000); + %} + ins_pipe(ialu_none); +%} + +// Magic constant, 0x41f00000 +instruct loadConI_x41f00000(iRegI dst) %{ + effect(DEF dst); + size(8); + format %{ "MOV_SLOW $dst, 0x41f00000\t! 2^32" %} + ins_encode %{ + __ mov($dst$$Register, 0x41f00000); + %} + ins_pipe(ialu_none); +%} + +instruct loadConI_x0(iRegI dst) %{ + effect(DEF dst); + size(4); + format %{ "MOV $dst, 0x0\t! 0" %} + ins_encode %{ + __ mov($dst$$Register, 0); + %} + ins_pipe(ialu_none); +%} + +// Construct a double from two float halves +instruct regDHi_regDLo_to_regD(regD_low dst, regD_low src1, regD_low src2) %{ + effect(DEF dst, USE src1, USE src2); + size(8); + format %{ "FCPYS $dst.hi,$src1.hi\n\t" + "FCPYS $dst.lo,$src2.lo" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister->successor(FloatRegisterImpl::SINGLE), $src1$$FloatRegister->successor(FloatRegisterImpl::SINGLE)); + __ vmov_f32($dst$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(faddD_reg_reg); +%} + +// Convert integer in high half of a double register (in the lower half of +// the double register file) to double +instruct convI2D_regDHi_regD(regD dst, regD_low src) %{ + effect(DEF dst, USE src); + size(4); + format %{ "FSITOD $dst,$src" %} + ins_encode %{ + __ vcvt_f64_s32($dst$$FloatRegister, $src$$FloatRegister->successor(FloatRegisterImpl::SINGLE));// TODO verify the samentics is the same as was before + %} + ins_pipe(fcvtLHi2D); +%} + +// Add float double precision +instruct addD_regD_regD(regD dst, regD src1, regD src2) %{ + effect(DEF dst, USE src1, USE src2); + size(4); + format %{ "FADDD $dst,$src1,$src2" %} + ins_encode %{ + __ vadd_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(faddD_reg_reg); +%} + +// Sub float double precision +instruct subD_regD_regD(regD dst, regD src1, regD src2) %{ + effect(DEF dst, USE src1, USE src2); + size(4); + format %{ "FSUBD $dst,$src1,$src2" %} + ins_encode %{ + __ vsub_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(faddD_reg_reg); +%} + +// Mul float double precision +instruct mulD_regD_regD(regD dst, regD src1, regD src2) %{ + effect(DEF dst, USE src1, USE src2); + size(4); + format %{ "FMULD $dst,$src1,$src2" %} + ins_encode %{ + __ vmul_f64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe(fmulD_reg_reg); +%} + +instruct regL_to_regD(regD dst, iRegL src) %{ + // No match rule to avoid chain rule match. + effect(DEF dst, USE src); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "FMDRR $dst,$src\t! regL to regD" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$Register, $src$$Register->successor()); + %} + ins_pipe(ialu_reg_reg); // FIXME +%} + +instruct regI_regI_to_regD(regD dst, iRegI src1, iRegI src2) %{ + // No match rule to avoid chain rule match. + effect(DEF dst, USE src1, USE src2); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "FMDRR $dst,$src1,$src2\t! 
regI,regI to regD" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src1$$Register, $src2$$Register); + %} + ins_pipe(ialu_reg_reg); // FIXME +%} + +instruct convL2D_reg_slow_fxtof(regD dst, iRegL src) %{ + match(Set dst (ConvL2D src)); + ins_cost(DEFAULT_COST*8 + MEMORY_REF_COST*6); // FIXME + + expand %{ + regD_low tmpsrc; + iRegI ix43300000; + iRegI ix41f00000; + iRegI ix0; + regD_low dx43300000; + regD dx41f00000; + regD tmp1; + regD_low tmp2; + regD tmp3; + regD tmp4; + + regL_to_regD(tmpsrc, src); + + loadConI_x43300000(ix43300000); + loadConI_x41f00000(ix41f00000); + loadConI_x0(ix0); + + regI_regI_to_regD(dx43300000, ix0, ix43300000); + regI_regI_to_regD(dx41f00000, ix0, ix41f00000); + + convI2D_regDHi_regD(tmp1, tmpsrc); + regDHi_regDLo_to_regD(tmp2, dx43300000, tmpsrc); + subD_regD_regD(tmp3, tmp2, dx43300000); + mulD_regD_regD(tmp4, tmp1, dx41f00000); + addD_regD_regD(dst, tmp3, tmp4); + %} +%} + +instruct convL2I_reg(iRegI dst, iRegL src) %{ + match(Set dst (ConvL2I src)); + size(4); + format %{ "MOV $dst,$src.lo\t! long->int" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register); + %} + ins_pipe(ialu_move_reg_I_to_L); +%} + +// Register Shift Right Immediate +instruct shrL_reg_imm6_L2I(iRegI dst, iRegL src, immI_32_63 cnt) %{ + match(Set dst (ConvL2I (RShiftL src cnt))); + size(4); + format %{ "ASR $dst,$src.hi,($cnt - 32)\t! long->int or mov if $cnt==32" %} + ins_encode %{ + if ($cnt$$constant == 32) { + __ mov($dst$$Register, $src$$Register->successor()); + } else { + __ mov($dst$$Register, $src$$Register->successor(), asr($cnt$$constant - 32)); + } + %} + ins_pipe(ialu_reg_imm); +%} + + +//----------Control Flow Instructions------------------------------------------ +// Compare Instructions +// Compare Integers +instruct compI_iReg(flagsReg icc, iRegI op1, iRegI op2) %{ + match(Set icc (CmpI op1 op2)); + effect( DEF icc, USE op1, USE op2 ); + + size(4); + format %{ "cmp_32 $op1,$op2\t! int" %} + ins_encode %{ + __ cmp($op1$$Register, $op2$$Register); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compU_iReg(flagsRegU icc, iRegI op1, iRegI op2) %{ + match(Set icc (CmpU op1 op2)); + + size(4); + format %{ "cmp_32 $op1,$op2\t! unsigned int" %} + ins_encode %{ + __ cmp($op1$$Register, $op2$$Register); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compI_iReg_immneg(flagsReg icc, iRegI op1, aimmIneg op2) %{ + match(Set icc (CmpI op1 op2)); + effect( DEF icc, USE op1 ); + + size(4); + format %{ "cmn_32 $op1,-$op2\t! int" %} + ins_encode %{ + __ cmn($op1$$Register, -$op2$$constant); + %} + ins_pipe(ialu_cconly_reg_imm); +%} + +instruct compI_iReg_imm(flagsReg icc, iRegI op1, aimmI op2) %{ + match(Set icc (CmpI op1 op2)); + effect( DEF icc, USE op1 ); + + size(4); + format %{ "cmp_32 $op1,$op2\t! 
int" %} + ins_encode %{ + __ cmp($op1$$Register, $op2$$constant); + %} + ins_pipe(ialu_cconly_reg_imm); +%} + +instruct testI_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 op2) zero)); + size(4); + format %{ "tst $op2,$op1" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testshlI_reg_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, iRegI op3, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 (LShiftI op2 op3)) zero)); + size(4); + format %{ "TST $op2,$op1<<$op3" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register, lsl($op3$$Register)); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testshlI_reg_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immU5 op3, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 (LShiftI op2 op3)) zero)); + size(4); + format %{ "tst $op2,$op1<<$op3" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register, lsl($op3$$constant)); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testsarI_reg_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, iRegI op3, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 (RShiftI op2 op3)) zero)); + size(4); + format %{ "TST $op2,$op1<<$op3" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register, asr($op3$$Register)); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testsarI_reg_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immU5 op3, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 (RShiftI op2 op3)) zero)); + size(4); + format %{ "tst $op2,$op1<<$op3" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register, asr($op3$$constant)); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testshrI_reg_reg_reg( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, iRegI op3, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 (URShiftI op2 op3)) zero)); + size(4); + format %{ "TST $op2,$op1<<$op3" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register, lsr($op3$$Register)); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testshrI_reg_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, iRegI op2, immU5 op3, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 (URShiftI op2 op3)) zero)); + size(4); + format %{ "tst $op2,$op1<<$op3" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$Register, lsr($op3$$constant)); + %} + ins_pipe(ialu_cconly_reg_reg_zero); +%} + +instruct testI_reg_imm( flagsReg_EQNELTGE icc, iRegI op1, limmI op2, immI0 zero ) %{ + match(Set icc (CmpI (AndI op1 op2) zero)); + size(4); + format %{ "tst $op2,$op1" %} + + ins_encode %{ + __ tst($op1$$Register, $op2$$constant); + %} + ins_pipe(ialu_cconly_reg_imm_zero); +%} + +instruct compL_reg_reg_LTGE(flagsRegL_LTGE xcc, iRegL op1, iRegL op2, iRegI tmp) %{ + match(Set xcc (CmpL op1 op2)); + effect( DEF xcc, USE op1, USE op2, TEMP tmp ); + + size(8); + format %{ "CMP $op1.low,$op2.low\t\t! long\n\t" + "SBCS $tmp,$op1.hi,$op2.hi" %} + ins_encode %{ + __ cmp($op1$$Register, $op2$$Register); + __ sbcs($tmp$$Register, $op1$$Register->successor(), $op2$$Register->successor()); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compL_reg_reg_EQNE(flagsRegL_EQNE xcc, iRegL op1, iRegL op2) %{ + match(Set xcc (CmpL op1 op2)); + effect( DEF xcc, USE op1, USE op2 ); + + size(8); + format %{ "TEQ $op1.hi,$op2.hi\t\t! 
long\n\t" + "TEQ.eq $op1.lo,$op2.lo" %} + ins_encode %{ + __ teq($op1$$Register->successor(), $op2$$Register->successor()); + __ teq($op1$$Register, $op2$$Register, Assembler::EQ); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compL_reg_reg_LEGT(flagsRegL_LEGT xcc, iRegL op1, iRegL op2, iRegI tmp) %{ + match(Set xcc (CmpL op1 op2)); + effect( DEF xcc, USE op1, USE op2, TEMP tmp ); + + size(8); + format %{ "CMP $op2.low,$op1.low\t\t! long\n\t" + "SBCS $tmp,$op2.hi,$op1.hi" %} + ins_encode %{ + __ cmp($op2$$Register, $op1$$Register); + __ sbcs($tmp$$Register, $op2$$Register->successor(), $op1$$Register->successor()); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compUL_reg_reg(flagsRegUL xcc, iRegL op1, iRegL op2) %{ + match(Set xcc (CmpUL op1 op2)); + effect( DEF xcc, USE op1, USE op2 ); + + size(8); + format %{ "CMP $op1.hi,$op2.hi\t\t! long\n\t" + "CMP.eq $op1.low,$op2.low" %} + ins_encode %{ + __ cmp($op1$$Register->successor(), $op2$$Register->successor()); + __ cmp($op1$$Register, $op2$$Register, Assembler::EQ); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct compL_reg_con_LTGE(flagsRegL_LTGE xcc, iRegL op1, immLlowRot con, iRegI tmp) %{ + match(Set xcc (CmpL op1 con)); + effect( DEF xcc, USE op1, USE con, TEMP tmp ); + + size(8); + format %{ "CMP $op1.low,$con\t\t! long\n\t" + "SBCS $tmp,$op1.hi,0" %} + ins_encode %{ + __ cmp($op1$$Register, (int)$con$$constant); + __ sbcs($tmp$$Register, $op1$$Register->successor(), 0); + %} + + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compUL_reg_con(flagsRegUL xcc, iRegL op1, immLlowRot con ) %{ + match(Set xcc (CmpUL op1 con)); + effect( DEF xcc, USE op1, USE con ); + + size(8); + format %{ "CMP $op1.hi,0\t\t! long\n\t" + "CMP.eq $op1.low,$con" %} + ins_encode %{ + __ cmp($op1$$Register->successor(), 0); + __ cmp($op1$$Register, (int)$con$$constant, Assembler::EQ); + %} + + ins_pipe(ialu_cconly_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct compL_reg_con_EQNE(flagsRegL_EQNE xcc, iRegL op1, immLlowRot con) %{ + match(Set xcc (CmpL op1 con)); + effect( DEF xcc, USE op1, USE con ); + + size(8); + format %{ "TEQ $op1.hi,0\t\t! long\n\t" + "TEQ.eq $op1.lo,$con" %} + ins_encode %{ + __ teq($op1$$Register->successor(), 0); + __ teq($op1$$Register, (int)$con$$constant, Assembler::EQ); + %} + + ins_pipe(ialu_cconly_reg_reg); +%} + +// TODO: try immLRot2 instead, (0, $con$$constant) becomes +// (hi($con$$constant), lo($con$$constant)) becomes +instruct compL_reg_con_LEGT(flagsRegL_LEGT xcc, iRegL op1, immLlowRot con, iRegL tmp) %{ + match(Set xcc (CmpL op1 con)); + effect( DEF xcc, USE op1, USE con, TEMP tmp ); + + size(8); + format %{ "RSBS $tmp,$op1.low,$con\t\t! 
long\n\t" + "RSCS $tmp,$op1.hi,0" %} + ins_encode %{ + __ rsbs($tmp$$Register, $op1$$Register, (long)$con$$constant); + __ rscs($tmp$$Register->successor(), $op1$$Register->successor(), 0); + %} + + ins_pipe(ialu_cconly_reg_reg); +%} + +/* instruct testL_reg_reg(flagsRegL xcc, iRegL op1, iRegL op2, immL0 zero) %{ */ +/* match(Set xcc (CmpL (AndL op1 op2) zero)); */ +/* ins_encode %{ */ +/* __ stop("testL_reg_reg unimplemented"); */ +/* %} */ +/* ins_pipe(ialu_cconly_reg_reg); */ +/* %} */ + +/* // useful for checking the alignment of a pointer: */ +/* instruct testL_reg_con(flagsRegL xcc, iRegL op1, immLlowRot con, immL0 zero) %{ */ +/* match(Set xcc (CmpL (AndL op1 con) zero)); */ +/* ins_encode %{ */ +/* __ stop("testL_reg_con unimplemented"); */ +/* %} */ +/* ins_pipe(ialu_cconly_reg_reg); */ +/* %} */ + +instruct compU_iReg_imm(flagsRegU icc, iRegI op1, aimmU31 op2 ) %{ + match(Set icc (CmpU op1 op2)); + + size(4); + format %{ "cmp_32 $op1,$op2\t! unsigned" %} + ins_encode %{ + __ cmp($op1$$Register, $op2$$constant); + %} + ins_pipe(ialu_cconly_reg_imm); +%} + +// Compare Pointers +instruct compP_iRegP(flagsRegP pcc, iRegP op1, iRegP op2 ) %{ + match(Set pcc (CmpP op1 op2)); + + size(4); + format %{ "CMP $op1,$op2\t! ptr" %} + ins_encode %{ + __ cmp($op1$$Register, $op2$$Register); + %} + ins_pipe(ialu_cconly_reg_reg); +%} + +instruct compP_iRegP_imm(flagsRegP pcc, iRegP op1, aimmP op2 ) %{ + match(Set pcc (CmpP op1 op2)); + + size(4); + format %{ "CMP $op1,$op2\t! ptr" %} + ins_encode %{ + assert($op2$$constant == 0 || _opnds[2]->constant_reloc() == relocInfo::none, "reloc in cmp?"); + __ cmp($op1$$Register, $op2$$constant); + %} + ins_pipe(ialu_cconly_reg_imm); +%} + +//----------Max and Min-------------------------------------------------------- +// Min Instructions +// Conditional move for min +instruct cmovI_reg_lt( iRegI op2, iRegI op1, flagsReg icc ) %{ + effect( USE_DEF op2, USE op1, USE icc ); + + size(4); + format %{ "MOV.lt $op2,$op1\t! min" %} + ins_encode %{ + __ mov($op2$$Register, $op1$$Register, Assembler::LT); + %} + ins_pipe(ialu_reg_flags); +%} + +// Min Register with Register. +instruct minI_eReg(iRegI op1, iRegI op2) %{ + match(Set op2 (MinI op1 op2)); + ins_cost(DEFAULT_COST*2); + expand %{ + flagsReg icc; + compI_iReg(icc,op1,op2); + cmovI_reg_lt(op2,op1,icc); + %} +%} + +// Max Instructions +// Conditional move for max +instruct cmovI_reg_gt( iRegI op2, iRegI op1, flagsReg icc ) %{ + effect( USE_DEF op2, USE op1, USE icc ); + format %{ "MOV.gt $op2,$op1\t! 
max" %} + ins_encode %{ + __ mov($op2$$Register, $op1$$Register, Assembler::GT); + %} + ins_pipe(ialu_reg_flags); +%} + +// Max Register with Register +instruct maxI_eReg(iRegI op1, iRegI op2) %{ + match(Set op2 (MaxI op1 op2)); + ins_cost(DEFAULT_COST*2); + expand %{ + flagsReg icc; + compI_iReg(icc,op1,op2); + cmovI_reg_gt(op2,op1,icc); + %} +%} + + +//----------Float Compares---------------------------------------------------- +// Compare floating, generate condition code +instruct cmpF_cc(flagsRegF fcc, flagsReg icc, regF src1, regF src2) %{ + match(Set icc (CmpF src1 src2)); + effect(KILL fcc); + + size(8); + format %{ "FCMPs $src1,$src2\n\t" + "FMSTAT" %} + ins_encode %{ + __ vcmp_f32($src1$$FloatRegister, $src2$$FloatRegister); + __ get_fpsr(); + %} + ins_pipe(faddF_fcc_reg_reg_zero); +%} + +instruct cmpF0_cc(flagsRegF fcc, flagsReg icc, regF src1, immF0 src2) %{ + match(Set icc (CmpF src1 src2)); + effect(KILL fcc); + + size(8); + format %{ "FCMPs $src1,$src2\n\t" + "FMSTAT" %} + ins_encode %{ + __ vcmp_f32($src1$$FloatRegister, 0); + __ get_fpsr(); + %} + ins_pipe(faddF_fcc_reg_reg_zero); +%} + +instruct cmpD_cc(flagsRegF fcc, flagsReg icc, regD src1, regD src2) %{ + match(Set icc (CmpD src1 src2)); + effect(KILL fcc); + + size(8); + format %{ "FCMPd $src1,$src2 \n\t" + "FMSTAT" %} + ins_encode %{ + __ vcmp_f64($src1$$FloatRegister, $src2$$FloatRegister); + __ get_fpsr(); + %} + ins_pipe(faddD_fcc_reg_reg_zero); +%} + +instruct cmpD0_cc(flagsRegF fcc, flagsReg icc, regD src1, immD0 src2) %{ + match(Set icc (CmpD src1 src2)); + effect(KILL fcc); + + size(8); + format %{ "FCMPZd $src1,$src2 \n\t" + "FMSTAT" %} + ins_encode %{ + __ vcmp_f64($src1$$FloatRegister, 0); + __ get_fpsr(); + %} + ins_pipe(faddD_fcc_reg_reg_zero); +%} + +// Compare floating, generate -1,0,1 +instruct cmpF_reg(iRegI dst, regF src1, regF src2, flagsRegF fcc) %{ + match(Set dst (CmpF3 src1 src2)); + effect(KILL fcc); + ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME + size(20); + // same number of instructions as code using conditional moves but + // doesn't kill integer condition register + format %{ "FCMPs $dst,$src1,$src2 \n\t" + "VMRS $dst, FPSCR \n\t" + "OR $dst, $dst, 0x08000000 \n\t" + "EOR $dst, $dst, $dst << 3 \n\t" + "MOV $dst, $dst >> 30" %} + ins_encode %{ + __ vcmp_f32($src1$$FloatRegister, $src2$$FloatRegister); + __ floating_cmp($dst$$Register); + %} + ins_pipe( floating_cmp ); +%} + +instruct cmpF0_reg(iRegI dst, regF src1, immF0 src2, flagsRegF fcc) %{ + match(Set dst (CmpF3 src1 src2)); + effect(KILL fcc); + ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME + size(20); + // same number of instructions as code using conditional moves but + // doesn't kill integer condition register + format %{ "FCMPZs $dst,$src1,$src2 \n\t" + "VMRS $dst, FPSCR \n\t" + "OR $dst, $dst, 0x08000000 \n\t" + "EOR $dst, $dst, $dst << 3 \n\t" + "MOV $dst, $dst >> 30" %} + ins_encode %{ + __ vcmp_f32($src1$$FloatRegister, 0); + __ floating_cmp($dst$$Register); + %} + ins_pipe( floating_cmp ); +%} + +instruct cmpD_reg(iRegI dst, regD src1, regD src2, flagsRegF fcc) %{ + match(Set dst (CmpD3 src1 src2)); + effect(KILL fcc); + ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME + size(20); + // same number of instructions as code using conditional moves but + // doesn't kill integer condition register + format %{ "FCMPd $dst,$src1,$src2 \n\t" + "VMRS $dst, FPSCR \n\t" + "OR $dst, $dst, 0x08000000 \n\t" + "EOR $dst, $dst, $dst << 3 \n\t" + "MOV $dst, $dst >> 30" %} + ins_encode %{ + __ vcmp_f64($src1$$FloatRegister, 
$src2$$FloatRegister);
+    __ floating_cmp($dst$$Register);
+  %}
+  ins_pipe( floating_cmp );
+%}
+
+instruct cmpD0_reg(iRegI dst, regD src1, immD0 src2, flagsRegF fcc) %{
+  match(Set dst (CmpD3 src1 src2));
+  effect(KILL fcc);
+  ins_cost(DEFAULT_COST*3+BRANCH_COST*3); // FIXME
+  size(20);
+  // same number of instructions as code using conditional moves but
+  // doesn't kill integer condition register
+  format %{ "FCMPZd $dst,$src1,$src2 \n\t"
+            "VMRS $dst, FPSCR \n\t"
+            "OR $dst, $dst, 0x08000000 \n\t"
+            "EOR $dst, $dst, $dst << 3 \n\t"
+            "MOV $dst, $dst >> 30" %}
+  ins_encode %{
+    __ vcmp_f64($src1$$FloatRegister, 0);
+    __ floating_cmp($dst$$Register);
+  %}
+  ins_pipe( floating_cmp );
+%}
+
+//----------Branches---------------------------------------------------------
+// Jump
+// (compare 'operand indIndex' and 'instruct addP_reg_reg' above)
+// FIXME
+instruct jumpXtnd(iRegX switch_val, iRegP tmp) %{
+  match(Jump switch_val);
+  effect(TEMP tmp);
+  ins_cost(350);
+  format %{ "ADD $tmp, $constanttablebase, $switch_val\n\t"
+            "LDR $tmp,[$tmp + $constantoffset]\n\t"
+            "BX $tmp" %}
+  size(20);
+  ins_encode %{
+    Register table_reg;
+    Register label_reg = $tmp$$Register;
+    if (constant_offset() == 0) {
+      table_reg = $constanttablebase;
+      __ ldr(label_reg, Address(table_reg, $switch_val$$Register));
+    } else {
+      table_reg = $tmp$$Register;
+      int offset = $constantoffset;
+      if (is_memoryP(offset)) {
+        __ add(table_reg, $constanttablebase, $switch_val$$Register);
+        __ ldr(label_reg, Address(table_reg, offset));
+      } else {
+        __ mov(table_reg, $constantoffset);
+        __ add(table_reg, $constanttablebase, table_reg);
+        __ ldr(label_reg, Address(table_reg, $switch_val$$Register));
+      }
+    }
+    __ b(label_reg); // ldr + b better than ldr to PC for branch predictor?
+    // __ ldr(PC, Address($table$$Register, $switch_val$$Register));
+  %}
+  ins_pipe(ialu_reg_reg);
+%}
+
+// // Direct Branch.
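As an aside on the CmpF3/CmpD3 rules above: the VMRS/OR/EOR/shift sequence in their format strings turns the VFP compare flags directly into the -1/0/+1 result Java expects. A minimal C++ model of that bit trick is sketched below (illustrative only, not VM code; it assumes the final ">> 30" is an arithmetic shift and that FPSCR carries N,Z,C,V in bits 31..28):

```cpp
#include <cstdint>

// Model of the floating_cmp flag trick (assumption: ">> 30" above is arithmetic).
// fpscr holds the VFP compare flags N,Z,C,V in bits 31..28.
int floating_cmp_model(uint32_t fpscr) {
  uint32_t v = fpscr | 0x08000000u;  // plant a 1 in bit 27
  v ^= v << 3;                       // now bit 31 = N ^ V, bit 30 = !Z
  return (int32_t)v >> 30;           // -1 (less or unordered), 0 (equal), +1 (greater)
}
```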
+instruct branch(label labl) %{ + match(Goto); + effect(USE labl); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B $labl" %} + ins_encode %{ + __ b(*($labl$$label)); + %} + ins_pipe(br); +%} + +// Conditional Direct Branch +instruct branchCon(cmpOp cmp, flagsReg icc, label labl) %{ + match(If cmp icc); + effect(USE labl); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $icc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchCon_EQNELTGE(cmpOp0 cmp, flagsReg_EQNELTGE icc, label labl) %{ + match(If cmp icc); + effect(USE labl); + predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $icc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchConU(cmpOpU cmp, flagsRegU icc, label labl) %{ + match(If cmp icc); + effect(USE labl); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $icc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchConP(cmpOpP cmp, flagsRegP pcc, label labl) %{ + match(If cmp pcc); + effect(USE labl); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $pcc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchConL_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, label labl) %{ + match(If cmp xcc); + effect(USE labl); + predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $xcc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchConL_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, label labl) %{ + match(If cmp xcc); + effect(USE labl); + predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $xcc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchConL_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, label labl) %{ + match(If cmp xcc); + effect(USE labl); + predicate( _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt || _kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le ); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $xcc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchConUL(cmpOpU cmp, flagsRegUL xcc, label labl) %{ + match(If cmp xcc); + effect(USE labl); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $xcc,$labl" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +instruct branchLoopEnd(cmpOp cmp, flagsReg icc, label labl) %{ + match(CountedLoopEnd cmp icc); + effect(USE labl); + + size(4); + ins_cost(BRANCH_COST); + format %{ "B$cmp $icc,$labl\t! 
Loop end" %} + ins_encode %{ + __ b(*($labl$$label), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(br_cc); +%} + +// instruct branchLoopEndU(cmpOpU cmp, flagsRegU icc, label labl) %{ +// match(CountedLoopEnd cmp icc); +// ins_pipe(br_cc); +// %} + +// ============================================================================ +// Long Compare +// +// Currently we hold longs in 2 registers. Comparing such values efficiently +// is tricky. The flavor of compare used depends on whether we are testing +// for LT, LE, or EQ. For a simple LT test we can check just the sign bit. +// The GE test is the negated LT test. The LE test can be had by commuting +// the operands (yielding a GE test) and then negating; negate again for the +// GT test. The EQ test is done by ORcc'ing the high and low halves, and the +// NE test is negated from that. + +// Due to a shortcoming in the ADLC, it mixes up expressions like: +// (foo (CmpI (CmpL X Y) 0)) and (bar (CmpI (CmpL X 0L) 0)). Note the +// difference between 'Y' and '0L'. The tree-matches for the CmpI sections +// are collapsed internally in the ADLC's dfa-gen code. The match for +// (CmpI (CmpL X Y) 0) is silently replaced with (CmpI (CmpL X 0L) 0) and the +// foo match ends up with the wrong leaf. One fix is to not match both +// reg-reg and reg-zero forms of long-compare. This is unfortunate because +// both forms beat the trinary form of long-compare and both are very useful +// on Intel which has so few registers. + +// instruct branchCon_long(cmpOp cmp, flagsRegL xcc, label labl) %{ +// match(If cmp xcc); +// ins_pipe(br_cc); +// %} + +// Manifest a CmpL3 result in an integer register. Very painful. +// This is the test to avoid. +instruct cmpL3_reg_reg(iRegI dst, iRegL src1, iRegL src2, flagsReg ccr ) %{ + match(Set dst (CmpL3 src1 src2) ); + effect( KILL ccr ); + ins_cost(6*DEFAULT_COST); // FIXME + size(32); + format %{ + "CMP $src1.hi, $src2.hi\t\t! long\n" + "\tMOV.gt $dst, 1\n" + "\tmvn.lt $dst, 0\n" + "\tB.ne done\n" + "\tSUBS $dst, $src1.lo, $src2.lo\n" + "\tMOV.hi $dst, 1\n" + "\tmvn.lo $dst, 0\n" + "done:" %} + ins_encode %{ + Label done; + __ cmp($src1$$Register->successor(), $src2$$Register->successor()); + __ mov_i($dst$$Register, 1, Assembler::GT); + __ mvn_i($dst$$Register, 0, Assembler::LT); + __ b(done, Assembler::NE); + __ subs($dst$$Register, $src1$$Register, $src2$$Register); + __ mov_i($dst$$Register, 1, Assembler::HI); + __ mvn_i($dst$$Register, 0, Assembler::LO); + __ bind(done); + %} + ins_pipe(cmpL_reg); +%} + +// Conditional move +instruct cmovLL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + ins_cost(150); + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! 
long\n\t" + "MOV$cmp $dst,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovLL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + ins_cost(150); + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! long\n\t" + "MOV$cmp $dst,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovLL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegL dst, iRegL src) %{ + match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + + ins_cost(150); + size(8); + format %{ "MOV$cmp $dst.lo,$src.lo\t! long\n\t" + "MOV$cmp $dst,$src.hi" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), $src$$Register->successor(), (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovLL_imm_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegL dst, immL0 src) %{ + match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + ins_cost(140); + size(8); + format %{ "MOV$cmp $dst.lo,0\t! long\n\t" + "MOV$cmp $dst,0" %} + ins_encode %{ + __ mov($dst$$Register, 0, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLL_imm_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegL dst, immL0 src) %{ + match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + ins_cost(140); + size(8); + format %{ "MOV$cmp $dst.lo,0\t! long\n\t" + "MOV$cmp $dst,0" %} + ins_encode %{ + __ mov($dst$$Register, 0, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovLL_imm_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegL dst, immL0 src) %{ + match(Set dst (CMoveL (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + ins_cost(140); + size(8); + format %{ "MOV$cmp $dst.lo,0\t! 
long\n\t" + "MOV$cmp $dst,0" %} + ins_encode %{ + __ mov($dst$$Register, 0, (Assembler::Condition)($cmp$$cmpcode)); + __ mov($dst$$Register->successor(), 0, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovIL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovIL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovIL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegI dst, iRegI src) %{ + match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovIL_imm_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + ins_cost(140); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovIL_imm_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + ins_cost(140); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovIL_imm_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegI dst, immI16 src) %{ + match(Set dst (CMoveI (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + + ins_cost(140); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovPL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ 
mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegP dst, iRegP src) %{ + match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + + ins_cost(150); + size(4); + format %{ "MOV$cmp $dst,$src" %} + ins_encode %{ + __ mov($dst$$Register, $src$$Register, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_reg); +%} + +instruct cmovPL_imm_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + ins_cost(140); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovPL_imm_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + ins_cost(140); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovPL_imm_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, iRegP dst, immP0 src) %{ + match(Set dst (CMoveP (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + + ins_cost(140); + format %{ "MOVW$cmp $dst,$src" %} + ins_encode %{ + __ movw_i($dst$$Register, $src$$constant, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(ialu_imm); +%} + +instruct cmovFL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + ins_cost(150); + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovFL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + ins_cost(150); + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + 
ins_pipe(int_conditional_float_move); +%} + +instruct cmovFL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, regF dst, regF src) %{ + match(Set dst (CMoveF (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + ins_cost(150); + size(4); + format %{ "FCPYS$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f32($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovDL_reg_LTGE(cmpOpL cmp, flagsRegL_LTGE xcc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::lt || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ge ); + + ins_cost(150); + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovDL_reg_EQNE(cmpOpL cmp, flagsRegL_EQNE xcc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::eq || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::ne ); + + ins_cost(150); + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +instruct cmovDL_reg_LEGT(cmpOpL_commute cmp, flagsRegL_LEGT xcc, regD dst, regD src) %{ + match(Set dst (CMoveD (Binary cmp xcc) (Binary dst src))); + predicate(_kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::le || _kids[0]->_kids[0]->_leaf->as_Bool()->_test._test == BoolTest::gt ); + + ins_cost(150); + size(4); + format %{ "FCPYD$cmp $dst,$src" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$FloatRegister, (Assembler::Condition)($cmp$$cmpcode)); + %} + ins_pipe(int_conditional_float_move); +%} + +// ============================================================================ +// Safepoint Instruction +// rather than KILL R12, it would be better to use any reg as +// TEMP. Can't do that at this point because it crashes the compiler +instruct safePoint_poll(iRegP poll, R12RegI tmp, flagsReg icc) %{ + match(SafePoint poll); + effect(USE poll, KILL tmp, KILL icc); + + size(4); + format %{ "LDR $tmp,[$poll]\t! Safepoint: poll for GC" %} + ins_encode %{ + __ relocate(relocInfo::poll_type); + __ ldr($tmp$$Register, Address($poll$$Register)); + %} + ins_pipe(loadPollP); +%} + + +// ============================================================================ +// Call Instructions +// Call Java Static Instruction +instruct CallStaticJavaDirect( method meth ) %{ + match(CallStaticJava); + predicate(! 
((CallStaticJavaNode*)n)->is_method_handle_invoke());
+  effect(USE meth);
+  size(call_static_enc_size(this, _method, _method_handle_invoke));
+
+  ins_cost(CALL_COST);
+  format %{ "CALL,static ==> " %}
+  ins_encode( Java_Static_Call( meth ), call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Call Java Static Instruction (method handle version)
+instruct CallStaticJavaHandle( method meth ) %{
+  match(CallStaticJava);
+  predicate(((CallStaticJavaNode*)n)->is_method_handle_invoke());
+  effect(USE meth);
+  size(call_static_enc_size(this, _method, _method_handle_invoke));
+
+  // FP is saved by all callees (for interpreter stack correction).
+  // We use it here for a similar purpose, in {preserve,restore}_FP.
+
+  ins_cost(CALL_COST);
+  format %{ "CALL,static/MethodHandle ==> " %}
+  ins_encode( preserve_SP, Java_Static_Call( meth ), restore_SP, call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Call Java Dynamic Instruction
+instruct CallDynamicJavaDirect( method meth ) %{
+  match(CallDynamicJava);
+  effect(USE meth);
+  size(call_dynamic_enc_size());
+
+  ins_cost(CALL_COST);
+  format %{ "MOV_OOP (empty),R_R8\n\t"
+            "CALL,dynamic ; NOP ==> " %}
+  ins_encode( Java_Dynamic_Call( meth ), call_epilog );
+  ins_pipe(call);
+%}
+
+// Call Runtime Instruction
+instruct CallRuntimeDirect(method meth) %{
+  match(CallRuntime);
+  effect(USE meth);
+  ins_cost(CALL_COST);
+  size(call_runtime_enc_size(this));
+
+  format %{ "CALL,runtime" %}
+  ins_encode( Java_To_Runtime( meth ),
+              call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Call runtime without safepoint - same as CallRuntime
+instruct CallLeafDirect(method meth) %{
+  match(CallLeaf);
+  effect(USE meth);
+  ins_cost(CALL_COST);
+  size(call_runtime_enc_size(this));
+
+  format %{ "CALL,runtime leaf" %}
+  // TODO: need save_last_PC here?
+  ins_encode( Java_To_Runtime( meth ),
+              call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Call runtime without safepoint - same as CallLeaf
+instruct CallLeafNoFPDirect(method meth) %{
+  match(CallLeafNoFP);
+  effect(USE meth);
+  ins_cost(CALL_COST);
+  size(call_runtime_enc_size(this));
+
+  format %{ "CALL,runtime leaf nofp" %}
+  // TODO: need save_last_PC here?
+  ins_encode( Java_To_Runtime( meth ),
+              call_epilog );
+  ins_pipe(simple_call);
+%}
+
+// Tail Call; Jump from runtime stub to Java code.
+// Also known as an 'interprocedural jump'.
+// Target of jump will eventually return to caller.
+// TailJump below removes the return address.
+instruct TailCalljmpInd(iRegP jump_target, inline_cache_regP method_oop) %{
+  match(TailCall jump_target method_oop );
+
+  ins_cost(CALL_COST);
+  format %{ "MOV Rexception_pc, LR\n\t"
+            "jump $jump_target \t! $method_oop holds method oop" %}
+  ins_encode %{
+    __ mov(r3, lr); // this is used only to call
+                    // StubRoutines::forward_exception_entry()
+                    // which expects PC of exception in
+                    // R3. FIXME?
+    __ b($jump_target$$Register);
+  %}
+  ins_pipe(tail_call);
+%}
+
+
+// Return Instruction
+instruct Ret() %{
+  match(Return);
+
+  format %{ "ret LR" %}
+
+  ins_encode %{
+    __ ret(lr);
+  %}
+
+  ins_pipe(br);
+%}
+
+
+// Tail Jump; remove the return address; jump to target.
+// TailCall above leaves the return address around.
+// TailJump is used in only one place, the rethrow_Java stub (fancy_jump=2).
+// ex_oop (Exception Oop) is needed in %o0 at the jump. As there would be a
+// "restore" before this instruction (in Epilogue), we need to materialize it
+// in %i0.
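The TailCall/TailJump comments above, and the TailJump rule just below, hinge on one point: the jump target eventually returns to the *original* caller, so no new return address is created. A plain C++ analogy of the distinction (illustrative only):

```cpp
long target(long x);       // some callee

long regular_call(long x) {
  return target(x) + 1;    // control must come back here, so a return address is needed
}

long tail_position(long x) {
  return target(x);        // nothing left to do here: this can compile to a plain branch,
                           // and target() then returns directly to tail_position()'s caller
}
```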
+instruct tailjmpInd(IPRegP jump_target, RExceptionRegP ex_oop) %{
+  match( TailJump jump_target ex_oop );
+  ins_cost(CALL_COST);
+  format %{ "MOV Rexception_pc, LR\n\t"
+            "jump $jump_target \t! $ex_oop holds exc. oop" %}
+  ins_encode %{
+    __ mov(r3, lr);
+    __ b($jump_target$$Register);
+  %}
+  ins_pipe(tail_call);
+%}
+
+// Create exception oop: created by stack-crawling runtime code.
+// Created exception is now available to this handler, and is setup
+// just prior to jumping to this handler. No code emitted.
+instruct CreateException( RExceptionRegP ex_oop )
+%{
+  match(Set ex_oop (CreateEx));
+  ins_cost(0);
+
+  size(0);
+  // use the following format syntax
+  format %{ "! exception oop is in Rexception_obj; no code emitted" %}
+  ins_encode();
+  ins_pipe(empty);
+%}
+
+
+// Rethrow exception:
+// The exception oop will come in the first argument position.
+// Then JUMP (not call) to the rethrow stub code.
+instruct RethrowException()
+%{
+  match(Rethrow);
+  ins_cost(CALL_COST);
+
+  // use the following format syntax
+  format %{ "b rethrow_stub" %}
+  ins_encode %{
+    Register scratch = r1;
+    assert_different_registers(scratch, c_rarg0, lr);
+    __ jump(OptoRuntime::rethrow_stub(), relocInfo::runtime_call_type, scratch);
+  %}
+  ins_pipe(tail_call);
+%}
+
+
+// Die now
+instruct ShouldNotReachHere( )
+%{
+  match(Halt);
+  ins_cost(CALL_COST);
+
+  size(4);
+  // Use the following format syntax
+  format %{ "ShouldNotReachHere" %}
+  ins_encode %{
+    __ udf(0xdead);
+  %}
+  ins_pipe(tail_call);
+%}
+
+// ============================================================================
+// The 2nd slow-half of a subtype check. Scan the subklass's 2ndary superklass
+// array for an instance of the superklass. Set a hidden internal cache on a
+// hit (cache is checked with exposed code in gen_subtype_check()). Return
+// not zero for a miss or zero for a hit. The encoding ALSO sets flags.
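The comment above compresses a lot; before the instruction itself, a hypothetical C++ sketch of the slow path it describes may help (struct layout and field names are illustrative only, not HotSpot's real Klass layout):

```cpp
#include <cstddef>

// Illustrative sketch only: HotSpot's real secondary-supers structure differs.
struct KlassSketch {
  KlassSketch** secondary_supers;      // assumed NULL-terminated list of secondary supers
  KlassSketch*  secondary_super_cache; // the "hidden internal cache" mentioned above
};

// Returns 0 for a hit (and remembers it in the cache), non-zero for a miss,
// mirroring the contract stated in the comment above.
int partial_subtype_check_sketch(KlassSketch* sub, KlassSketch* super) {
  for (KlassSketch** p = sub->secondary_supers; *p != nullptr; ++p) {
    if (*p == super) {
      sub->secondary_super_cache = super;  // the next fast-path check hits the cache
      return 0;
    }
  }
  return 1;
}
```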
+instruct partialSubtypeCheck( R0RegP index, R1RegP sub, R2RegP super, flagsRegP pcc, LRRegP lr, R9RegI r9, R12RegI r12 ) %{ + match(Set index (PartialSubtypeCheck sub super)); + effect( KILL pcc, KILL r9, KILL r12, KILL lr ); + ins_cost(DEFAULT_COST*10); + format %{ "CALL PartialSubtypeCheck" %} + ins_encode %{ + __ call(StubRoutines::aarch32::partial_subtype_check(), relocInfo::runtime_call_type); + %} + ins_pipe(partial_subtype_check_pipe); +%} + +/* instruct partialSubtypeCheck_vs_zero( flagsRegP pcc, o1RegP sub, o2RegP super, immP0 zero, o0RegP idx, o7RegP o7 ) %{ */ +/* match(Set pcc (CmpP (PartialSubtypeCheck sub super) zero)); */ +/* ins_pipe(partial_subtype_check_pipe); */ +/* %} */ + + +// ============================================================================ +// inlined locking and unlocking + +instruct cmpFastLock(flagsRegP pcc, iRegP object, iRegP box, iRegP mark, iRegP scratch2, iRegP scratch ) +%{ + match(Set pcc (FastLock object box)); + + effect(TEMP mark, TEMP scratch, TEMP scratch2); + ins_cost(100); + + format %{ "FASTLOCK $object, $box; KILL $mark, $scratch, $scratch2" %} + ins_encode %{ + __ fast_lock($object$$Register, $box$$Register, $mark$$Register, $scratch$$Register, $scratch2$$Register); + %} + ins_pipe(long_memory_op); +%} + + +instruct cmpFastUnlock(flagsRegP pcc, iRegP object, iRegP box, iRegP scratch2, iRegP scratch ) %{ + match(Set pcc (FastUnlock object box)); + effect(TEMP scratch, TEMP scratch2); + ins_cost(100); + + format %{ "FASTUNLOCK $object, $box; KILL $scratch, $scratch2" %} + ins_encode %{ + __ fast_unlock($object$$Register, $box$$Register, $scratch$$Register, $scratch2$$Register); + %} + ins_pipe(long_memory_op); +%} + +// Count and Base registers are fixed because the allocator cannot +// kill unknown registers. The encodings are generic. +instruct clear_array(iRegX cnt, iRegP base, iRegI temp, iRegX zero, Universe dummy, flagsReg cpsr) %{ + match(Set dummy (ClearArray cnt base)); + effect(TEMP temp, TEMP zero, KILL cpsr); + ins_cost(300); + format %{ "MOV $zero,0\n" + " MOV $temp,$cnt\n" + "loop: SUBS $temp,$temp,4\t! Count down a dword of bytes\n" + " STR.ge $zero,[$base+$temp]\t! delay slot" + " B.gt loop\t\t! 
Clearing loop\n" %} + ins_encode %{ + __ mov($zero$$Register, 0); + __ mov($temp$$Register, $cnt$$Register); + Label(loop); + __ bind(loop); + __ subs($temp$$Register, $temp$$Register, 4); + __ str($zero$$Register, Address($base$$Register, $temp$$Register), Assembler::GE); + __ b(loop, Assembler::GT); + %} + ins_pipe(long_memory_op); +%} + +instruct string_compareUU(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result, + iRegI tmp1, iRegI tmp2, Q0_regD tmp3, Q1_regD tmp4, flagsReg ccr) %{ + predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result # KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode( enc_String_Compare(str1, str2, cnt1, cnt2, result, tmp1, tmp2, tmp3, tmp4, (2), (2)) ); + ins_pipe(long_memory_op); +%} + +instruct string_compareLL(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result, + iRegI tmp1, iRegI tmp2, Q0_regD tmp3, Q1_regD tmp4, flagsReg ccr) %{ + predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result # KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode( enc_String_Compare(str1, str2, cnt1, cnt2, result, tmp1, tmp2, tmp3, tmp4, (1), (1)) ); + ins_pipe(long_memory_op); +%} + +instruct string_compareUL(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result, + iRegI tmp1, iRegI tmp2, Q0_regD tmp3, Q1_regD tmp4, flagsReg ccr) %{ + predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::UL); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result # KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode( enc_String_Compare(str1, str2, cnt1, cnt2, result, tmp1, tmp2, tmp3, tmp4, (2), (1)) ); + ins_pipe(long_memory_op); +%} + +instruct string_compareLU(R0RegP str1, R1RegP str2, R2RegI cnt1, R3RegI cnt2, iRegI result, + iRegI tmp1, iRegI tmp2, Q0_regD tmp3, Q1_regD tmp4, flagsReg ccr) %{ + predicate(((StrCompNode*)n)->encoding() == StrIntrinsicNode::LU); + match(Set result (StrComp (Binary str1 cnt1) (Binary str2 cnt2))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, USE_KILL str1, USE_KILL str2, USE_KILL cnt1, USE_KILL cnt2, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "String Compare $str1,$cnt1,$str2,$cnt2 -> $result # KILL $tmp1, $tmp2, $tmp3, $tmp4" %} + ins_encode( enc_String_Compare(str1, str2, cnt1, cnt2, result, tmp1, tmp2, tmp3, tmp4, (1), (2)) ); + ins_pipe(long_memory_op); +%} + +instruct string_equalsUU(R0RegP str1, R1RegP str2, R2RegI cnt, iRegI result, iRegI tmp1, flagsReg ccr) %{ + predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (StrEquals (Binary str1 str2) cnt)); + effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP tmp1, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "String Equals $str1,$str2,$cnt -> $result # KILL $tmp1" %} + ins_encode( 
enc_Array_Equals(str1, str2, cnt, tmp1, result, (2), (false)) ); + ins_pipe(long_memory_op); +%} + +instruct string_equalsLL(R0RegP str1, R1RegP str2, R2RegI cnt, iRegI result, iRegI tmp1, flagsReg ccr) %{ + predicate(((StrEqualsNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (StrEquals (Binary str1 str2) cnt)); + effect(USE_KILL str1, USE_KILL str2, USE_KILL cnt, TEMP tmp1, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "String Equals $str1,$str2,$cnt -> $result # KILL $tmp1" %} + ins_encode( enc_Array_Equals(str1, str2, cnt, tmp1, result, (1), (false)) ); + ins_pipe(long_memory_op); +%} + +instruct array_equalsUU(R0RegP ary1, R1RegP ary2, iRegI tmp1, iRegI tmp2, iRegI result, flagsReg ccr) %{ + predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::UU); + match(Set result (AryEq ary1 ary2)); + effect(USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "Array Equals $ary1,$ary2 -> $result # KILL $tmp1,$tmp2" %} + ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result, (2), (true))); + ins_pipe(long_memory_op); +%} + +instruct array_equalsLL(R0RegP ary1, R1RegP ary2, iRegI tmp1, iRegI tmp2, iRegI result, flagsReg ccr) %{ + predicate(((AryEqNode*)n)->encoding() == StrIntrinsicNode::LL); + match(Set result (AryEq ary1 ary2)); + effect(USE_KILL ary1, USE_KILL ary2, TEMP tmp1, TEMP tmp2, TEMP result, KILL ccr); + + ins_cost(300); + format %{ "Array Equals $ary1,$ary2 -> $result # KILL $tmp1,$tmp2" %} + ins_encode( enc_Array_Equals(ary1, ary2, tmp1, tmp2, result, (1), (true))); + ins_pipe(long_memory_op); +%} + +instruct string_compress(R2RegP src, R1RegP dst, R3RegI len, + R9RegI tmp1, Q0_regD tmp2, Q1_regD tmp3, R12RegI tmp4, LRRegP lr, R0RegI result, flagsReg ccr) +%{ + match(Set result (StrCompressedCopy src (Binary dst len))); + effect(TEMP tmp1, TEMP tmp2, TEMP tmp3, TEMP tmp4, TEMP lr, USE_KILL src, USE_KILL dst, USE_KILL len, KILL ccr); + + format %{ "String Compress $src,$dst -> $result // KILL $tmp1, $tmp2, $tmp3, $tmp4, $lr" %} + ins_encode( enc_Char_Array_Compress(src, dst, len, tmp1, tmp2, tmp3, tmp4, result, ccr) ); + ins_pipe(long_memory_op); +%} + +instruct string_inflate(Universe dummy, R0RegP src, R1RegP dst, R2RegI len, + iRegI tmp1, Q0_regD tmp2, LRRegP lr, flagsReg ccr) +%{ + match(Set dummy (StrInflatedCopy src (Binary dst len))); + effect(TEMP tmp1, TEMP tmp2, TEMP lr, USE_KILL src, USE_KILL dst, USE_KILL len, KILL ccr); + + format %{ "String Inflate $src,$dst // KILL $tmp1, $tmp2, $lr" %} + ins_encode( enc_Byte_Array_Inflate(src, dst, len, tmp1, tmp2, ccr) ); + ins_pipe(long_memory_op); +%} + +//---------- Zeros Count Instructions ------------------------------------------ + +instruct countLeadingZerosI(iRegI dst, iRegI src) %{ + match(Set dst (CountLeadingZerosI src)); + size(4); + format %{ "CLZ_32 $dst,$src" %} + ins_encode %{ + __ clz($dst$$Register, $src$$Register); + %} + ins_pipe(ialu_reg); +%} + +instruct countLeadingZerosL(iRegI dst, iRegL src, iRegI tmp, flagsReg ccr) %{ + match(Set dst (CountLeadingZerosL src)); + effect(TEMP tmp, TEMP dst, KILL ccr); + size(16); + format %{ "CLZ $dst,$src.hi\n\t" + "TEQ $dst,32\n\t" + "CLZ.eq $tmp,$src.lo\n\t" + "ADD.eq $dst, $dst, $tmp\n\t" %} + ins_encode %{ + __ clz($dst$$Register, $src$$Register->successor()); + __ teq($dst$$Register, 32); + __ clz($tmp$$Register, $src$$Register, Assembler::EQ); + __ add($dst$$Register, $dst$$Register, $tmp$$Register, Assembler::EQ); + %} + ins_pipe(ialu_reg); +%} + +instruct 
countTrailingZerosI(iRegI dst, iRegI src, iRegI tmp) %{ + match(Set dst (CountTrailingZerosI src)); + effect(TEMP tmp); + size(8); + format %{ "RBIT_32 $tmp, $src\n\t" + "CLZ_32 $dst,$tmp" %} + ins_encode %{ + __ rbit($tmp$$Register, $src$$Register); + __ clz($dst$$Register, $tmp$$Register); + %} + ins_pipe(ialu_reg); +%} + +instruct countTrailingZerosL(iRegI dst, iRegL src, iRegI tmp, flagsReg ccr) %{ + match(Set dst (CountTrailingZerosL src)); + effect(TEMP tmp, TEMP dst, KILL ccr); + size(24); + format %{ "RBIT $tmp,$src.lo\n\t" + "CLZ $dst,$tmp\n\t" + "TEQ $dst,32\n\t" + "RBIT $tmp,$src.hi\n\t" + "CLZ.eq $tmp,$tmp\n\t" + "ADD.eq $dst,$dst,$tmp\n\t" %} + ins_encode %{ + __ rbit($tmp$$Register, $src$$Register); + __ clz($dst$$Register, $tmp$$Register); + __ teq($dst$$Register, 32); + __ rbit($tmp$$Register, $src$$Register->successor()); + __ clz($tmp$$Register, $tmp$$Register, Assembler::EQ); + __ add($dst$$Register, $dst$$Register, $tmp$$Register, Assembler::EQ); + %} + ins_pipe(ialu_reg); +%} + + +//---------- Population Count Instructions ------------------------------------- + +instruct popCountI(iRegI dst, iRegI src, regD_low tmp) %{ + predicate(UsePopCountInstruction); + match(Set dst (PopCountI src)); + effect(TEMP tmp); + + format %{ "FMSR $tmp,$src\n\t" + "VCNT.8 $tmp,$tmp\n\t" + "VPADDL.U8 $tmp,$tmp\n\t" + "VPADDL.U16 $tmp,$tmp\n\t" + "FMRS $dst,$tmp" %} + size(20); + + ins_encode %{ + __ vmov_f32($tmp$$FloatRegister, $src$$Register); + __ vcnt_64($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vpaddl_64_u8($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vpaddl_64_u16($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vmov_f32($dst$$Register, $tmp$$FloatRegister); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Note: Long.bitCount(long) returns an int. 
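The CLZ/RBIT rules above lean on two identities that are easy to check in plain C++; a self-contained sketch (not VM code, portable stand-ins for the CLZ and RBIT instructions):

```cpp
#include <cstdint>

int clz32(uint32_t x) {
  int n = 0;
  while (n < 32 && (x & 0x80000000u) == 0) { x <<= 1; ++n; }
  return n;                                  // CLZ(0) == 32, as on ARM
}

uint32_t rbit32(uint32_t x) {
  uint32_t r = 0;
  for (int i = 0; i < 32; ++i) { r = (r << 1) | (x & 1u); x >>= 1; }
  return r;
}

// countLeadingZerosL above: CLZ the high word, and only if it was all zero
// (CLZ == 32) add the CLZ of the low word -- that is what the TEQ/ADD.eq pair does.
int clz64(uint32_t lo, uint32_t hi) {
  int n = clz32(hi);
  if (n == 32) n += clz32(lo);
  return n;
}

// countTrailingZerosI above: reverse the bits, then count leading zeros.
int ctz32(uint32_t x) {
  return clz32(rbit32(x));
}
```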
+instruct popCountL(iRegI dst, iRegL src, regD_low tmp) %{ + predicate(UsePopCountInstruction); + match(Set dst (PopCountL src)); + effect(TEMP tmp); + + format %{ "FMDRR $tmp,$src.lo,$src.hi\n\t" + "VCNT.8 $tmp,$tmp\n\t" + "VPADDL.U8 $tmp,$tmp\n\t" + "VPADDL.U16 $tmp,$tmp\n\t" + "VPADDL.U32 $tmp,$tmp\n\t" + "FMRS $dst,$tmp" %} + + size(32); + + ins_encode %{ + __ vmov_f64($tmp$$FloatRegister, $src$$Register, $src$$Register->successor()); + __ vcnt_64($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vpaddl_64_u8($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vpaddl_64_u16($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vpaddl_64_u32($tmp$$FloatRegister, $tmp$$FloatRegister); + __ vmov_f32($dst$$Register, $tmp$$FloatRegister); + %} + ins_pipe(ialu_reg); +%} + + +// ============================================================================ +//------------Bytes reverse-------------------------------------------------- + +instruct bytes_reverse_int(iRegI dst, iRegI src) %{ + match(Set dst (ReverseBytesI src)); + + size(4); + format %{ "REV32 $dst,$src" %} + ins_encode %{ + __ rev($dst$$Register, $src$$Register); + %} + ins_pipe( iload_mem ); // FIXME +%} + +instruct bytes_reverse_long(iRegL dst, iRegL src) %{ + match(Set dst (ReverseBytesL src)); + effect(TEMP dst); + size(8); + format %{ "REV $dst.lo,$src.lo\n\t" + "REV $dst.hi,$src.hi" %} + ins_encode %{ + __ rev($dst$$Register, $src$$Register->successor()); + __ rev($dst$$Register->successor(), $src$$Register); + %} + ins_pipe( iload_mem ); // FIXME +%} + +instruct bytes_reverse_unsigned_short(iRegI dst, iRegI src) %{ + match(Set dst (ReverseBytesUS src)); + size(4); + format %{ "REV16 $dst,$src" %} + ins_encode %{ + __ rev16($dst$$Register, $src$$Register); + %} + ins_pipe( iload_mem ); // FIXME +%} + +instruct bytes_reverse_short(iRegI dst, iRegI src) %{ + match(Set dst (ReverseBytesS src)); + size(4); + format %{ "REVSH $dst,$src" %} + ins_encode %{ + __ revsh($dst$$Register, $src$$Register); + %} + ins_pipe( iload_mem ); // FIXME +%} + + +// ====================VECTOR INSTRUCTIONS===================================== + +// Load Aligned Packed values into a Double Register +instruct loadV8(vecD dst, memoryD mem) %{ + predicate(n->as_LoadVector()->memory_size() == 8); + match(Set dst (LoadVector mem)); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "FLDD $mem,$dst\t! load vector (8 bytes)" %} + ins_encode %{ + __ vldr_f64($dst$$FloatRegister, $mem$$Address); + %} + ins_pipe(floadD_mem); +%} + +// Load Aligned Packed values into a Double Register Pair +instruct loadV16(vecX dst, memoryvld mem) %{ + predicate(n->as_LoadVector()->memory_size() == 16); + match(Set dst (LoadVector mem)); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "VLD1 $mem,$dst.Q\t! load vector (16 bytes)" %} + ins_encode %{ + __ vld1_16($dst$$FloatRegister, $dst$$FloatRegister->successor(FloatRegisterImpl::DOUBLE), $mem$$Address, Assembler::ALIGN_STD); + %} + ins_pipe(floadD_mem); // FIXME +%} + +// Store Vector in Double register to memory +instruct storeV8(memoryD mem, vecD src) %{ + predicate(n->as_StoreVector()->memory_size() == 8); + match(Set mem (StoreVector mem src)); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "FSTD $src,$mem\t! 
store vector (8 bytes)" %} + ins_encode %{ + __ vstr_f64($src$$FloatRegister, $mem$$Address); + %} + ins_pipe(fstoreD_mem_reg); +%} + +// Store Vector in Double Register Pair to memory +instruct storeV16(memoryvld mem, vecX src) %{ + predicate(n->as_StoreVector()->memory_size() == 16); + match(Set mem (StoreVector mem src)); + ins_cost(MEMORY_REF_COST); + size(4); + format %{ "VST1 $src,$mem\t! store vector (16 bytes)" %} + ins_encode %{ + __ vst1_16($src$$FloatRegister, $src$$FloatRegister->successor(FloatRegisterImpl::DOUBLE), $mem$$Address, Assembler::ALIGN_STD); + %} + ins_pipe(fstoreD_mem_reg); // FIXME +%} + +// Replicate scalar to packed byte values in Double register +instruct Repl8B_reg(vecD dst, iRegI src, iRegI tmp) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateB src)); + ins_cost(DEFAULT_COST*4); + effect(TEMP tmp); + size(16); + + // FIXME: could use PKH instruction instead? + format %{ "LSL $tmp, $src, 24 \n\t" + "OR $tmp, $tmp, ($tmp >> 8) \n\t" + "OR $tmp, $tmp, ($tmp >> 16) \n\t" + "FMDRR $dst,$tmp,$tmp\t" %} + ins_encode %{ + __ mov($tmp$$Register, $src$$Register, lsl(24)); + __ orr($tmp$$Register, $tmp$$Register, $tmp$$Register, lsr(8)); + __ orr($tmp$$Register, $tmp$$Register, $tmp$$Register, lsr(16)); + __ vmov_f64($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed byte values in Double register +instruct Repl8B_reg_simd(vecD dst, iRegI src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateB src)); + size(4); + + format %{ "VDUP.8 $dst,$src\t" %} + ins_encode %{ + __ vdup_64_8($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed byte values in Double register pair +instruct Repl16B_reg(vecX dst, iRegI src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (ReplicateB src)); + size(4); + + format %{ "VDUP.8 $dst.Q,$src\t" %} + ins_encode %{ + __ vdup_128_8($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register +instruct Repl8B_immI(vecD dst, immI src, iRegI tmp) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (ReplicateB src)); + ins_cost(DEFAULT_COST*2); + effect(TEMP tmp); + size(12); + + format %{ "MOV $tmp, Repl4($src))\n\t" + "FMDRR $dst,$tmp,$tmp\t" %} + ins_encode( LdReplImmI(src, dst, tmp, (4), (1)) ); + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register +// TODO: support negative constants with MVNI? 
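The non-SIMD ReplicateB rule above (Repl8B_reg) builds its splat pattern with plain shifts and ORs before FMDRR copies the word into both halves of the D register. The same trick in C++ (sketch only; the short variant further below uses a single 16-bit shift in the same way):

```cpp
#include <cstdint>

// Splat the low byte of src into all four bytes of a word, mirroring the
// LSL/ORR sequence shown in Repl8B_reg's format string above.
uint32_t splat_byte(uint32_t src) {
  uint32_t t = src << 24;  // low byte moved to the top byte
  t |= t >> 8;             // top two bytes now hold it
  t |= t >> 16;            // all four bytes hold it: 0x??????AB -> 0xABABABAB
  return t;                // FMDRR then writes this word into both halves of the D register
}
```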
+instruct Repl8B_immU8(vecD dst, immU8 src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateB src)); + size(4); + + format %{ "VMOV.U8 $dst,$src" %} + ins_encode %{ + __ vmov_64_8($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register pair +instruct Repl16B_immU8(vecX dst, immU8 src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateB src)); + size(4); + + format %{ "VMOV.U8 $dst.Q,$src" %} + ins_encode %{ + __ vmov_128_8($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar to packed short/char values into Double register +instruct Repl4S_reg(vecD dst, iRegI src, iRegI tmp) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateS src)); + ins_cost(DEFAULT_COST*3); + effect(TEMP tmp); + size(12); + + // FIXME: could use PKH instruction instead? + format %{ "LSL $tmp, $src, 16 \n\t" + "OR $tmp, $tmp, ($tmp >> 16) \n\t" + "FMDRR $dst,$tmp,$tmp\t" %} + ins_encode %{ + __ mov($tmp$$Register, $src$$Register, lsl(16)); + __ orr($tmp$$Register, $tmp$$Register, $tmp$$Register, lsr(16)); + __ vmov_f64($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed byte values in Double register +instruct Repl4S_reg_simd(vecD dst, iRegI src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateS src)); + size(4); + + format %{ "VDUP.16 $dst,$src\t" %} + ins_encode %{ + __ vdup_64_16($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed byte values in Double register pair +instruct Repl8S_reg(vecX dst, iRegI src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateS src)); + size(4); + + format %{ "VDUP.16 $dst.Q,$src\t" %} + ins_encode %{ + __ vdup_128_16($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + + +// Replicate scalar constant to packed short/char values in Double register +instruct Repl4S_immI(vecD dst, immI src, iRegP tmp) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateS src)); + effect(TEMP tmp); + size(12); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "MOV $tmp, Repl2($src))\n\t" + "FMDRR $dst,$tmp,$tmp\t" %} + ins_encode( LdReplImmI(src, dst, tmp, (2), (2)) ); + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register +instruct Repl4S_immU8(vecD dst, immU8 src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateS src)); + size(4); + + format %{ "VMOV.U16 $dst,$src" %} + ins_encode %{ + __ vmov_64_16($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register pair +instruct Repl8S_immU8(vecX dst, immU8 src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateS src)); + size(4); + + format %{ "VMOV.U16 $dst.Q,$src" %} + ins_encode %{ + __ vmov_128_16($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar to packed int values in Double 
register +instruct Repl2I_reg(vecD dst, iRegI src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateI src)); + size(4); + + format %{ "FMDRR $dst,$src,$src\t" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$Register, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed int values in Double register pair +instruct Repl4I_reg(vecX dst, iRegI src) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateI src)); + ins_cost(DEFAULT_COST*2); + size(8); + + format %{ "FMDRR $dst.lo,$src,$src\n\t" + "FMDRR $dst.hi,$src,$src" %} + + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$Register, $src$$Register); + __ vmov_f64($dst$$FloatRegister->successor(FloatRegisterImpl::DOUBLE), + $src$$Register, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed int values in Double register +instruct Repl2I_reg_simd(vecD dst, iRegI src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateI src)); + size(4); + + format %{ "VDUP.32 $dst.D,$src\t" %} + ins_encode %{ + __ vdup_64_32($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed int values in Double register pair +instruct Repl4I_reg_simd(vecX dst, iRegI src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateI src)); + size(4); + + format %{ "VDUP.32 $dst.Q,$src\t" %} + ins_encode %{ + __ vdup_128_32($dst$$FloatRegister, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + + +// Replicate scalar zero constant to packed int values in Double register +instruct Repl2I_immI(vecD dst, immI src, iRegI tmp) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateI src)); + effect(TEMP tmp); + size(12); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "MOV $tmp, Repl1($src))\n\t" + "FMDRR $dst,$tmp,$tmp\t" %} + ins_encode( LdReplImmI(src, dst, tmp, (1), (4)) ); + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register +instruct Repl2I_immU8(vecD dst, immU8 src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateI src)); + size(4); + + format %{ "VMOV.I32 $dst.D,$src" %} + ins_encode %{ + __ vmov_64_32($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar constant to packed byte values in Double register pair +instruct Repl4I_immU8(vecX dst, immU8 src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateI src)); + size(4); + + format %{ "VMOV.I32 $dst.Q,$src" %} + ins_encode %{ + __ vmov_128_32($dst$$FloatRegister, $src$$constant); + %} + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar to packed byte values in Double register pair +instruct Repl2L_reg(vecX dst, iRegL src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateL src)); + size(8); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FMDRR $dst.D,$src.lo,$src.hi\t\n" + "FMDRR $dst.D.next,$src.lo,$src.hi" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$Register, $src$$Register->successor()); + __ vmov_f64($dst$$FloatRegister->successor(FloatRegisterImpl::DOUBLE), + $src$$Register, $src$$Register->successor()); + %} + ins_pipe(ialu_reg); // FIXME +%} + + +// Replicate scalar 
to packed float values in Double register +instruct Repl2F_regI(vecD dst, iRegI src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateF src)); + size(4); + + format %{ "FMDRR $dst.D,$src,$src\t" %} + ins_encode %{ + __ vmov_f64($dst$$FloatRegister, $src$$Register, $src$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed float values in Double register +instruct Repl2F_reg_vfp(vecD dst, regF src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateF src)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + expand %{ + iRegI tmp; + MoveF2I_reg_reg(tmp, src); + Repl2F_regI(dst,tmp); + %} +%} + +// Replicate scalar to packed float values in Double register +instruct Repl2F_reg_simd(vecD dst, regF src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateF src)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + + format %{ "VDUP.32 $dst.D,$src.D\t" %} + ins_encode %{ + __ vdups_64($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed float values in Double register pair +instruct Repl4F_reg(vecX dst, regF src, iRegI tmp) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (ReplicateF src)); + effect(TEMP tmp); + size(4*3); + ins_cost(DEFAULT_COST*3); // FIXME + + format %{ "FMRS $tmp,$src\n\t" + "FMDRR $dst.D,$tmp,$tmp\n\t" + "FMDRR $dst.D.next,$tmp,$tmp\t" %} + ins_encode %{ + __ vmov_f32($tmp$$Register, $src$$FloatRegister); + __ vmov_f64($dst$$FloatRegister, $tmp$$Register, $tmp$$Register); + __ vmov_f64($dst$$FloatRegister->successor(FloatRegisterImpl::DOUBLE), + $tmp$$Register, $tmp$$Register); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar to packed float values in Double register pair +instruct Repl4F_reg_simd(vecX dst, regF src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (ReplicateF src)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + + format %{ "VDUP.32 $dst.Q,$src.D\t" %} + ins_encode %{ + __ vdups_128($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// Replicate scalar zero constant to packed float values in Double register +instruct Repl2F_immI(vecD dst, immF src, iRegI tmp) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateF src)); + effect(TEMP tmp); + size(12); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "MOV $tmp, Repl1($src))\n\t" + "FMDRR $dst,$tmp,$tmp\t" %} + ins_encode( LdReplImmF(src, dst, tmp) ); + ins_pipe(loadConFD); // FIXME +%} + +// Replicate scalar to packed double float values in Double register pair +instruct Repl2D_reg(vecX dst, regD src) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (ReplicateD src)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FCPYD $dst.D.a,$src\n\t" + "FCPYD $dst.D.b,$src\t" %} + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src = $src$$FloatRegister; + __ vmov_f64(dsta, src); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::DOUBLE); + __ vmov_f64(dstb, src); + %} + ins_pipe(ialu_reg); // FIXME +%} + +// ====================VECTOR ARITHMETIC======================================= + +// --------------------------------- ADD -------------------------------------- + +// Bytes vector add +instruct vadd8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 8); + match(Set 
dst (AddVB src1 src2)); + format %{ "VADD.I8 $dst,$src1,$src2\t! add packed8B" %} + size(4); + ins_encode %{ + __ vadd_64_8($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vadd16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (AddVB src1 src2)); + size(4); + format %{ "VADD.I8 $dst.Q,$src1.Q,$src2.Q\t! add packed16B" %} + ins_encode %{ + __ vadd_128_8($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Shorts/Chars vector add +instruct vadd4S_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVS src1 src2)); + size(4); + format %{ "VADD.I16 $dst,$src1,$src2\t! add packed4S" %} + ins_encode %{ + __ vadd_64_16($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vadd8S_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (AddVS src1 src2)); + size(4); + format %{ "VADD.I16 $dst.Q,$src1.Q,$src2.Q\t! add packed8S" %} + ins_encode %{ + __ vadd_128_16($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector add +instruct vadd2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVI src1 src2)); + size(4); + format %{ "VADD.I32 $dst.D,$src1.D,$src2.D\t! add packed2I" %} + ins_encode %{ + __ vadd_64_32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vadd4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVI src1 src2)); + size(4); + format %{ "VADD.I32 $dst.Q,$src1.Q,$src2.Q\t! add packed4I" %} + ins_encode %{ + __ vadd_128_32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector add +instruct vadd2L_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVL src1 src2)); + size(4); + format %{ "VADD.I64 $dst.Q,$src1.Q,$src2.Q\t! 
add packed2L" %} + ins_encode %{ + bool quad = true; + __ vadd_128_64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Floats vector add +instruct vadd2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVF src1 src2)); + ins_cost(DEFAULT_COST*2); // FIXME + + size(4*2); + format %{ "FADDS $dst.a,$src1.a,$src2.a\n\t" + "FADDS $dst.b,$src1.b,$src2.b" %} + ins_encode %{ + __ vadd_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + __ vadd_f32($dst$$FloatRegister->successor(FloatRegisterImpl::SINGLE), + $src1$$FloatRegister->successor(FloatRegisterImpl::SINGLE), + $src2$$FloatRegister->successor(FloatRegisterImpl::SINGLE)); + %} + + ins_pipe(faddF_reg_reg); // FIXME +%} + +instruct vadd4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (AddVF src1 src2)); + size(4*4); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "FADDS $dst.a,$src1.a,$src2.a\n\t" + "FADDS $dst.b,$src1.b,$src2.b\n\t" + "FADDS $dst.c,$src1.c,$src2.c\n\t" + "FADDS $dst.d,$src1.d,$src2.d" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vadd_f32(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::SINGLE); + __ vadd_f32(dstb, src1b, src2b); + FloatRegister dstc = dstb->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1c = src1b->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2c = src2b->successor(FloatRegisterImpl::SINGLE); + __ vadd_f32(dstc, src1c, src2c); + FloatRegister dstd = dstc->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1d = src1c->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2d = src2c->successor(FloatRegisterImpl::SINGLE); + __ vadd_f32(dstd, src1d, src2d); + %} + + ins_pipe(faddF_reg_reg); // FIXME +%} + +instruct vadd2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (AddVD src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FADDD $dst.a,$src1.a,$src2.a\n\t" + "FADDD $dst.b,$src1.b,$src2.b" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vadd_f64(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::DOUBLE); + __ vadd_f64(dstb, src1b, src2b); + %} + + ins_pipe(faddF_reg_reg); // FIXME +%} + + +// Bytes vector sub +instruct vsub8B_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (SubVB src1 src2)); + size(4); + format %{ "VSUB.I8 $dst,$src1,$src2\t! sub packed8B" %} + ins_encode %{ + __ vsub_64_8($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsub16B_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (SubVB src1 src2)); + size(4); + format %{ "VSUB.I8 $dst.Q,$src1.Q,$src2.Q\t! 
sub packed16B" %} + ins_encode %{ + __ vsub_128_8($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Shorts/Chars vector sub +instruct vsub4S_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVS src1 src2)); + size(4); + format %{ "VSUB.I16 $dst,$src1,$src2\t! sub packed4S" %} + ins_encode %{ + __ vsub_64_16($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsub16S_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (SubVS src1 src2)); + size(4); + format %{ "VSUB.I16 $dst.Q,$src1.Q,$src2.Q\t! sub packed8S" %} + ins_encode %{ + __ vsub_128_16($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector sub +instruct vsub2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVI src1 src2)); + size(4); + format %{ "VSUB.I32 $dst,$src1,$src2\t! sub packed2I" %} + ins_encode %{ + __ vsub_64_32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsub4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVI src1 src2)); + size(4); + format %{ "VSUB.I32 $dst.Q,$src1.Q,$src2.Q\t! sub packed4I" %} + ins_encode %{ + bool quad = true; + __ vsub_128_32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector sub +instruct vsub2L_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVL src1 src2)); + size(4); + format %{ "VSUB.I64 $dst.Q,$src1.Q,$src2.Q\t! 
sub packed2L" %} + ins_encode %{ + __ vsub_128_64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Floats vector sub +instruct vsub2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVF src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FSUBS $dst.a,$src1.a,$src2.a\n\t" + "FSUBS $dst.b,$src1.b,$src2.b" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vsub_f32(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::SINGLE); + __ vsub_f32(dstb, src1b, src2b); + %} + + ins_pipe(faddF_reg_reg); // FIXME +%} + + +instruct vsub4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (SubVF src1 src2)); + size(4*4); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "FSUBS $dst.a,$src1.a,$src2.a\n\t" + "FSUBS $dst.b,$src1.b,$src2.b\n\t" + "FSUBS $dst.c,$src1.c,$src2.c\n\t" + "FSUBS $dst.d,$src1.d,$src2.d" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vsub_f32(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::SINGLE); + __ vsub_f32(dstb, src1b, src2b); + FloatRegister dstc = dstb->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1c = src1b->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2c = src2b->successor(FloatRegisterImpl::SINGLE); + __ vsub_f32(dstc, src1c, src2c); + FloatRegister dstd = dstc->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1d = src1c->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2d = src2c->successor(FloatRegisterImpl::SINGLE); + __ vsub_f32(dstd, src1d, src2d); + %} + + ins_pipe(faddF_reg_reg); // FIXME +%} + +instruct vsub2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (SubVD src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FSUBD $dst.a,$src1.a,$src2.a\n\t" + "FSUBD $dst.b,$src1.b,$src2.b" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vsub_f64(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::DOUBLE); + __ vsub_f64(dstb, src1b, src2b); + %} + + ins_pipe(faddF_reg_reg); // FIXME +%} + +// Shorts/Chars vector mul +instruct vmul4S_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (MulVS src1 src2)); + size(4); + format %{ "VMUL.I16 $dst,$src1,$src2\t! 
mul packed4S" %} + ins_encode %{ + __ vmul_64_16($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vmul8S_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (MulVS src1 src2)); + size(4); + format %{ "VMUL.I16 $dst.Q,$src1.Q,$src2.Q\t! mul packed8S" %} + ins_encode %{ + __ vmul_128_16($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector mul +instruct vmul2I_reg(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (MulVI src1 src2)); + size(4); + format %{ "VMUL.I32 $dst,$src1,$src2\t! mul packed2I" %} + ins_encode %{ + __ vmul_64_32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vmul4I_reg(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (MulVI src1 src2)); + size(4); + format %{ "VMUL.I32 $dst.Q,$src1.Q,$src2.Q\t! mul packed4I" %} + ins_encode %{ + __ vmul_128_32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Floats vector mul +instruct vmul2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (MulVF src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FMULS $dst.a,$src1.a,$src2.a\n\t" + "FMULS $dst.b,$src1.b,$src2.b" %} + ins_encode %{ + __ vmul_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + __ vmul_f32($dst$$FloatRegister->successor(FloatRegisterImpl::SINGLE), + $src1$$FloatRegister->successor(FloatRegisterImpl::SINGLE), + $src2$$FloatRegister->successor(FloatRegisterImpl::SINGLE)); + %} + + ins_pipe(fmulF_reg_reg); // FIXME +%} + +instruct vmul4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (MulVF src1 src2)); + size(4*4); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "FMULS $dst.a,$src1.a,$src2.a\n\t" + "FMULS $dst.b,$src1.b,$src2.b\n\t" + "FMULS $dst.c,$src1.c,$src2.c\n\t" + "FMULS $dst.d,$src1.d,$src2.d" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vmul_f32(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::SINGLE); + __ vmul_f32(dstb, src1b, src2b); + FloatRegister dstc = dstb->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1c = src1b->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2c = src2b->successor(FloatRegisterImpl::SINGLE); + __ vmul_f32(dstc, src1c, src2c); + FloatRegister dstd = dstc->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1d = src1c->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2d = src2c->successor(FloatRegisterImpl::SINGLE); + __ vmul_f32(dstd, src1d, src2d); + %} + + ins_pipe(fmulF_reg_reg); // FIXME +%} + +instruct vmul2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (MulVD src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FMULD $dst.D.a,$src1.D.a,$src2.D.a\n\t" + "FMULD $dst.D.b,$src1.D.b,$src2.D.b" %} + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + 
FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vmul_f64(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::DOUBLE); + __ vmul_f64(dstb, src1b, src2b); + %} + + ins_pipe(fmulD_reg_reg); // FIXME +%} + + +// Floats vector div +instruct vdiv2F_reg_vfp(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (DivVF src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FDIVS $dst.a,$src1.a,$src2.a\n\t" + "FDIVS $dst.b,$src1.b,$src2.b" %} + ins_encode %{ + __ vdiv_f32($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + __ vdiv_f32($dst$$FloatRegister->successor(FloatRegisterImpl::SINGLE), + $src1$$FloatRegister->successor(FloatRegisterImpl::SINGLE), + $src2$$FloatRegister->successor(FloatRegisterImpl::SINGLE)); + %} + + ins_pipe(fdivF_reg_reg); // FIXME +%} + +instruct vdiv4F_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (DivVF src1 src2)); + size(4*4); + ins_cost(DEFAULT_COST*4); // FIXME + + format %{ "FDIVS $dst.a,$src1.a,$src2.a\n\t" + "FDIVS $dst.b,$src1.b,$src2.b\n\t" + "FDIVS $dst.c,$src1.c,$src2.c\n\t" + "FDIVS $dst.d,$src1.d,$src2.d" %} + + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vdiv_f32(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::SINGLE); + __ vdiv_f32(dstb, src1b, src2b); + FloatRegister dstc = dstb->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1c = src1b->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2c = src2b->successor(FloatRegisterImpl::SINGLE); + __ vdiv_f32(dstc, src1c, src2c); + FloatRegister dstd = dstc->successor(FloatRegisterImpl::SINGLE); + FloatRegister src1d = src1c->successor(FloatRegisterImpl::SINGLE); + FloatRegister src2d = src2c->successor(FloatRegisterImpl::SINGLE); + __ vdiv_f32(dstd, src1d, src2d); + %} + + ins_pipe(fdivF_reg_reg); // FIXME +%} + +instruct vdiv2D_reg_vfp(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (DivVD src1 src2)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "FDIVD $dst.D.a,$src1.D.a,$src2.D.a\n\t" + "FDIVD $dst.D.b,$src1.D.b,$src2.D.b" %} + ins_encode %{ + FloatRegister dsta = $dst$$FloatRegister; + FloatRegister src1a = $src1$$FloatRegister; + FloatRegister src2a = $src2$$FloatRegister; + __ vdiv_f64(dsta, src1a, src2a); + FloatRegister dstb = dsta->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src1b = src1a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister src2b = src2a->successor(FloatRegisterImpl::DOUBLE); + __ vdiv_f64(dstb, src1b, src2b); + %} + + ins_pipe(fdivD_reg_reg); // FIXME +%} + +// --------------------------------- NEG -------------------------------------- + +instruct vneg8B_reg(vecD dst, vecD src) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + effect(DEF dst, USE src); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ "VNEG.S8 $dst.D,$src.D\t! 
neg packed8B" %} + ins_encode %{ + __ vneg_64_s8($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vneg16B_reg(vecX dst, vecX src) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + effect(DEF dst, USE src); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ "VNEG.S8 $dst.Q,$src.Q\t! neg0 packed16B" %} + ins_encode %{ + __ vneg_128_s8($dst$$FloatRegister, $src$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// ------------------------------ Shift --------------------------------------- + +instruct vslcntD(vecD dst, iRegI cnt) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (LShiftCntV cnt)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + Repl8B_reg_simd(dst, cnt); + %} +%} + +instruct vslcntX(vecX dst, iRegI cnt) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (LShiftCntV cnt)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + Repl16B_reg(dst, cnt); + %} +%} + +// Low bits of vector "shift" elements are used, so it +// doesn't matter if we treat it as ints or bytes here. +instruct vsrcntD(vecD dst, iRegI cnt) %{ + predicate(n->as_Vector()->length_in_bytes() == 8 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (RShiftCntV cnt)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + + format %{ "VDUP.8 $dst.D,$cnt\n\t" + "VNEG.S8 $dst.D,$dst.D\t! neg packed8B" %} + ins_encode %{ + __ vdup_64_8($dst$$FloatRegister, $cnt$$Register); + __ vneg_64_s8($dst$$FloatRegister, $dst$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsrcntX(vecX dst, iRegI cnt) %{ + predicate(n->as_Vector()->length_in_bytes() == 16 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (RShiftCntV cnt)); + size(4*2); + ins_cost(DEFAULT_COST*2); // FIXME + format %{ "VDUP.8 $dst.Q,$cnt\n\t" + "VNEG.S8 $dst.Q,$dst.Q\t! neg packed16B" %} + ins_encode %{ + __ vdup_128_8($dst$$FloatRegister, $cnt$$Register); + __ vneg_128_s8($dst$$FloatRegister, $dst$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Byte vector logical left/right shift based on sign +instruct vsh8B_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 8); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U8 $dst.D,$src.D,$shift.D\t! logical left/right shift packed8B" + %} + ins_encode %{ + __ vshl_64_u8($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsh16B_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U8 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed16B" + %} + ins_encode %{ + bool quad = true; + __ vshl_128_u8($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Shorts/Char vector logical left/right shift based on sign +instruct vsh4S_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 4); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U16 $dst.D,$src.D,$shift.D\t! 
logical left/right shift packed4S" + %} + ins_encode %{ + __ vshl_64_u16($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsh8S_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U16 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed8S" + %} + ins_encode %{ + __ vshl_128_u16($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector logical left/right shift based on sign +instruct vsh2I_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U32 $dst.D,$src.D,$shift.D\t! logical left/right shift packed2I" + %} + ins_encode %{ + __ vshl_64_u32($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsh4I_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U32 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed4I" + %} + ins_encode %{ + __ vshl_128_u32($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector logical left/right shift based on sign +instruct vsh2L_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.U64 $dst.Q,$src.Q,$shift.Q\t! logical left/right shift packed2L" + %} + ins_encode %{ + __ vshl_128_u64($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// ------------------------------ LeftShift ----------------------------------- + +// Byte vector left shift +instruct vsl8B_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVB src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh8B_reg(dst, src, shift); + %} +%} + +instruct vsl16B_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (LShiftVB src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh16B_reg(dst, src, shift); + %} +%} + +instruct vsl8B_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVB src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I8 $dst.D,$src.D,$shift\t! logical left shift packed8B" + %} + ins_encode %{ + __ vshl_64_8($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsl16B_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (LShiftVB src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I8 $dst.Q,$src.Q,$shift\t! 
logical left shift packed16B" + %} + ins_encode %{ + bool quad = true; + __ vshl_128_8($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Shorts/Chars vector logical left/right shift +instruct vsl4S_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh4S_reg(dst, src, shift); + %} +%} + +instruct vsl8S_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVS src shift)); + match(Set dst (URShiftVS src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh8S_reg(dst, src, shift); + %} +%} + +instruct vsl4S_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (LShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I16 $dst.D,$src.D,$shift\t! logical left shift packed4S" + %} + ins_encode %{ + bool quad = false; + __ vshl_64_16($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsl8S_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (LShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I16 $dst.Q,$src.Q,$shift\t! logical left shift packed8S" + %} + ins_encode %{ + bool quad = true; + __ vshl_128_16($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector logical left/right shift +instruct vsl2I_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (LShiftVI src shift)); + match(Set dst (URShiftVI src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh2I_reg(dst, src, shift); + %} +%} + +instruct vsl4I_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (LShiftVI src shift)); + match(Set dst (URShiftVI src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh4I_reg(dst, src, shift); + %} +%} + +instruct vsl2I_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 2 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (LShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I32 $dst.D,$src.D,$shift\t! logical left shift packed2I" + %} + ins_encode %{ + __ vshl_64_32($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsl4I_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 4 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (LShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I32 $dst.Q,$src.Q,$shift\t! 
logical left shift packed4I" + %} + ins_encode %{ + __ vshl_128_32($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector logical left/right shift +instruct vsl2L_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVL src shift)); + match(Set dst (URShiftVL src shift)); + size(4*1); + ins_cost(DEFAULT_COST*1); // FIXME + expand %{ + vsh2L_reg(dst, src, shift); + %} +%} + +instruct vsl2L_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (LShiftVL src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.I64 $dst.Q,$src.Q,$shift\t! logical left shift packed2L" + %} + ins_encode %{ + __ vshl_128_64($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// ----------------------- LogicalRightShift ----------------------------------- + +// Bytes/Shorts vector logical right shift produces incorrect Java result +// for negative data because java code convert short value into int with +// sign extension before a shift. + +// Chars vector logical right shift +instruct vsrl4S_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (URShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.U16 $dst.D,$src.D,$shift\t! logical right shift packed4S" + %} + ins_encode %{ + __ vshr_64_u16($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsrl8S_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (URShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.U16 $dst.Q,$src.Q,$shift\t! logical right shift packed8S" + %} + ins_encode %{ + __ vshr_128_u16($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector logical right shift +instruct vsrl2I_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 2 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (URShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.U32 $dst.D,$src.D,$shift\t! logical right shift packed2I" + %} + ins_encode %{ + __ vshr_64_u32($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsrl4I_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 4 && (VM_Version::features() & FT_AdvSIMD)); + match(Set dst (URShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.U32 $dst.Q,$src.Q,$shift\t! logical right shift packed4I" + %} + ins_encode %{ + __ vshr_128_u32($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector logical right shift +instruct vsrl2L_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (URShiftVL src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.U64 $dst.Q,$src.Q,$shift\t! 
logical right shift packed2L" + %} + ins_encode %{ + __ vshr_128_u64($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// ------------------- ArithmeticRightShift ----------------------------------- + +// Bytes vector arithmetic left/right shift based on sign +instruct vsha8B_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 8); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S8 $dst.D,$src.D,$shift.D\t! arithmetic right shift packed8B" + %} + ins_encode %{ + __ vshl_64_s8($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsha16B_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S8 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed16B" + %} + ins_encode %{ + __ vshl_128_s8($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Shorts vector arithmetic left/right shift based on sign +instruct vsha4S_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 4); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S16 $dst.D,$src.D,$shift.D\t! arithmetic right shift packed4S" + %} + ins_encode %{ + __ vshl_64_s16($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsha8S_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S16 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed8S" + %} + ins_encode %{ + __ vshl_128_s16($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector arithmetic left/right shift based on sign +instruct vsha2I_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S32 $dst.D,$src.D,$shift.D\t! arithmetic right shift packed2I" + %} + ins_encode %{ + __ vshl_64_s32($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsha4I_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S32 $dst.Q,$src.Q,$shift.Q\t! arithmetic right shift packed4I" + %} + ins_encode %{ + __ vshl_128_s32($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector arithmetic left/right shift based on sign +instruct vsha2L_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2); + effect(DEF dst, USE src, USE shift); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHL.S64 $dst.Q,$src.Q,$shift.Q\t! 
arithmetic right shift packed2L" + %} + ins_encode %{ + __ vshl_128_s64($dst$$FloatRegister, $src$$FloatRegister, $shift$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Byte vector arithmetic right shift + +instruct vsra8B_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (RShiftVB src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha8B_reg(dst, src, shift); + %} +%} + +instruct vsrl16B_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (RShiftVB src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha16B_reg(dst, src, shift); + %} +%} + +instruct vsrl8B_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (RShiftVB src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S8 $dst.D,$src.D,$shift\t! logical right shift packed8B" + %} + ins_encode %{ + __ vshr_64_s8($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsrl16B_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 16); + match(Set dst (RShiftVB src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S8 $dst.Q,$src.Q,$shift\t! logical right shift packed16B" + %} + ins_encode %{ + __ vshr_128_s8($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Shorts vector arithmetic right shift +instruct vsra4S_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha4S_reg(dst, src, shift); + %} +%} + +instruct vsra8S_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (RShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha8S_reg(dst, src, shift); + %} +%} + +instruct vsra4S_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S16 $dst.D,$src.D,$shift\t! logical right shift packed4S" + %} + ins_encode %{ + __ vshr_64_s16($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsra8S_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 8); + match(Set dst (RShiftVS src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S16 $dst.Q,$src.Q,$shift\t! 
logical right shift packed8S" + %} + ins_encode %{ + __ vshr_128_s16($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Integers vector arithmetic right shift +instruct vsra2I_reg(vecD dst, vecD src, vecD shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha2I_reg(dst, src, shift); + %} +%} + +instruct vsra4I_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha4I_reg(dst, src, shift); + %} +%} + +instruct vsra2I_immI(vecD dst, vecD src, immI shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S32 $dst.D,$src.D,$shift\t! logical right shift packed2I" + %} + ins_encode %{ + __ vshr_64_s32($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vsra4I_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 4); + match(Set dst (RShiftVI src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S32 $dst.Q,$src.Q,$shift\t! logical right shift packed4I" + %} + ins_encode %{ + __ vshr_128_s32($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// Longs vector arithmetic right shift +instruct vsra2L_reg(vecX dst, vecX src, vecX shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVL src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + expand %{ + vsha2L_reg(dst, src, shift); + %} +%} + +instruct vsra2L_immI(vecX dst, vecX src, immI shift) %{ + predicate(n->as_Vector()->length() == 2); + match(Set dst (RShiftVL src shift)); + size(4); + ins_cost(DEFAULT_COST); // FIXME + format %{ + "VSHR.S64 $dst.Q,$src.Q,$shift\t! logical right shift packed2L" + %} + ins_encode %{ + __ vshr_128_s64($dst$$FloatRegister, $src$$FloatRegister, $shift$$constant); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// --------------------------------- AND -------------------------------------- + +instruct vandD(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + match(Set dst (AndV src1 src2)); + format %{ "VAND $dst.D,$src1.D,$src2.D\t! and vectors (8 bytes)" %} + ins_encode %{ + __ vand_64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vandX(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (AndV src1 src2)); + format %{ "VAND $dst.Q,$src1.Q,$src2.Q\t! and vectors (16 bytes)" %} + ins_encode %{ + bool quad = true; + __ vand_128($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// --------------------------------- OR --------------------------------------- + +instruct vorD(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + match(Set dst (OrV src1 src2)); + format %{ "VOR $dst.D,$src1.D,$src2.D\t! 
or vectors (8 bytes)" %} + ins_encode %{ + __ vorr_64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vorX(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (OrV src1 src2)); + format %{ "VOR $dst.Q,$src1.Q,$src2.Q\t! or vectors (16 bytes)" %} + ins_encode %{ + __ vorr_128($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +// --------------------------------- XOR -------------------------------------- + +instruct vxorD(vecD dst, vecD src1, vecD src2) %{ + predicate(n->as_Vector()->length_in_bytes() == 8); + match(Set dst (XorV src1 src2)); + format %{ "VXOR $dst.D,$src1.D,$src2.D\t! xor vectors (8 bytes)" %} + ins_encode %{ + __ veor_64($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + +instruct vxorX(vecX dst, vecX src1, vecX src2) %{ + predicate(n->as_Vector()->length_in_bytes() == 16); + match(Set dst (XorV src1 src2)); + format %{ "VXOR $dst.Q,$src1.Q,$src2.Q\t! xor vectors (16 bytes)" %} + ins_encode %{ + __ veor_128($dst$$FloatRegister, $src1$$FloatRegister, $src2$$FloatRegister); + %} + ins_pipe( ialu_reg_reg ); // FIXME +%} + + +//----------PEEPHOLE RULES----------------------------------------------------- +// These must follow all instruction definitions as they use the names +// defined in the instruction definitions. +// +// peepmatch ( root_instr_name [preceding_instruction]* ); +// +// peepconstraint %{ +// (instruction_number.operand_name relational_op instruction_number.operand_name +// [, ...] ); +// // instruction numbers are zero-based using left to right order in peepmatch +// +// peepreplace ( instr_name ( [instruction_number.operand_name]* ) ); +// // provide an instruction_number.operand_name for each operand that appears +// // in the replacement instruction's match rule +// +// ---------VM FLAGS--------------------------------------------------------- +// +// All peephole optimizations can be turned off using -XX:-OptoPeephole +// +// Each peephole rule is given an identifying number starting with zero and +// increasing by one in the order seen by the parser. An individual peephole +// can be enabled, and all others disabled, by using -XX:OptoPeepholeAt=# +// on the command-line.
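+// For example (hypothetical command lines, assuming a debug/fastdebug build
+// where these develop flags can be set):
+//   -XX:-OptoPeephole        disable all peephole rules
+//   -XX:OptoPeepholeAt=0     apply only the peephole rule numbered 0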
+// +// ---------CURRENT LIMITATIONS---------------------------------------------- +// +// Only match adjacent instructions in same basic block +// Only equality constraints +// Only constraints between operands, not (0.dest_reg == EAX_enc) +// Only one replacement instruction +// +// ---------EXAMPLE---------------------------------------------------------- +// +// // pertinent parts of existing instructions in architecture description +// instruct movI(eRegI dst, eRegI src) %{ +// match(Set dst (CopyI src)); +// %} +// +// instruct incI_eReg(eRegI dst, immI1 src, eFlagsReg cr) %{ +// match(Set dst (AddI dst src)); +// effect(KILL cr); +// %} +// +// // Change (inc mov) to lea +// peephole %{ +// // increment preceded by register-register move +// peepmatch ( incI_eReg movI ); +// // require that the destination register of the increment +// // match the destination register of the move +// peepconstraint ( 0.dst == 1.dst ); +// // construct a replacement instruction that sets +// // the destination to ( move's source register + one ) +// peepreplace ( incI_eReg_immI1( 0.dst 1.src 0.src ) ); +// %} +// + +// // Change load of spilled value to only a spill +// instruct storeI(memory mem, eRegI src) %{ +// match(Set mem (StoreI mem src)); +// %} +// +// instruct loadI(eRegI dst, memory mem) %{ +// match(Set dst (LoadI mem)); +// %} +// +// peephole %{ +// peepmatch ( loadI storeI ); +// peepconstraint ( 1.src == 0.dst, 1.mem == 0.mem ); +// peepreplace ( storeI( 1.mem 1.mem 1.src ) ); +// %} + +//----------SMARTSPILL RULES--------------------------------------------------- +// These must follow all instruction definitions as they use the names +// defined in the instruction definitions. +// +// ARM will probably not have any of these rules due to RISC instruction set. + +//----------PIPELINE----------------------------------------------------------- +// Rules which define the behavior of the target architecture's pipeline. --- /dev/null 2018-09-25 19:24:07.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/aarch32Test.cpp 2018-09-25 19:24:07.000000000 +0300 @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions.
+ * + */ + +#include + +#include "precompiled.hpp" +#include "code/codeBlob.hpp" +#include "asm/macroAssembler.hpp" + +// hook routine called during JVM bootstrap to test AArch32 assembler + +extern "C" void entry(CodeBuffer*); + +void aarch32TestHook() +{ + BufferBlob* b = BufferBlob::create("aarch32Test", 500000); + CodeBuffer code(b); + MacroAssembler _masm(&code); + entry(&code); +} --- /dev/null 2018-09-25 19:24:09.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/abstractInterpreter_aarch32.cpp 2018-09-25 19:24:08.000000000 +0300 @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "oops/method.hpp" +#include "runtime/frame.inline.hpp" +#include "utilities/debug.hpp" + +int AbstractInterpreter::BasicType_as_index(BasicType type) { + int i = 0; + switch (type) { + case T_BOOLEAN: i = 0; break; + case T_CHAR : i = 1; break; + case T_BYTE : i = 2; break; + case T_SHORT : i = 3; break; + case T_INT : i = 4; break; + case T_LONG : i = 5; break; + case T_VOID : i = 6; break; + case T_FLOAT : i = 7; break; + case T_DOUBLE : i = 8; break; + case T_OBJECT : i = 9; break; + case T_ARRAY : i = 9; break; + default : ShouldNotReachHere(); + } + assert(0 <= i && i < AbstractInterpreter::number_of_result_handlers, + "index out of bounds"); + return i; +} + +// How much stack a method activation needs in words. +int AbstractInterpreter::size_top_interpreter_activation(Method* method) { + const int entry_size = frame::interpreter_frame_monitor_size(); + + // total overhead size: entry_size + (saved rfp thru expr stack + // bottom). 
be sure to change this if you add/subtract anything + // to/from the overhead area + const int overhead_size = + -(frame::get_interpreter_frame_initial_sp_offset()) + entry_size; + + const int stub_code = frame::get_entry_frame_after_call_words(); + const int method_stack = (method->max_locals() + method->max_stack()) * + Interpreter::stackElementWords; + return (overhead_size + method_stack + stub_code); +} + +// asm based interpreter deoptimization helpers +int AbstractInterpreter::size_activation(int max_stack, + int temps, + int extra_args, + int monitors, + int callee_params, + int callee_locals, + bool is_top_frame) { + // Note: This calculation must exactly parallel the frame setup + // in TemplateInterpreterGenerator::generate_method_entry. + + // fixed size of an interpreter frame: + int overhead = frame::sender_sp_offset - + frame::get_interpreter_frame_initial_sp_offset(); + // Our locals were accounted for by the caller (or last_frame_adjust + // on the transition). Since the callee parameters already account + // for the callee's params we only need to account for the extra + // locals. + int size = overhead + + (callee_locals - callee_params)*Interpreter::stackElementWords + + monitors * frame::interpreter_frame_monitor_size() + + temps* Interpreter::stackElementWords + extra_args; + + // On AArch32 we keep the stack pointer aligned to an even number of + // stack words (8 bytes), so we must round up here. + size = align_up(size, 2); + + return size; +} + +void AbstractInterpreter::layout_activation(Method* method, + int tempcount, + int popframe_extra_args, + int moncount, + int caller_actual_parameters, + int callee_param_count, + int callee_locals, + frame* caller, + frame* interpreter_frame, + bool is_top_frame, + bool is_bottom_frame) { + // The frame interpreter_frame is guaranteed to be the right size, + // as determined by a previous call to the size_activation() method. + // It is also guaranteed to be walkable even though it is in a + // skeletal state. + + int max_locals = method->max_locals() * Interpreter::stackElementWords; + int extra_locals = (method->max_locals() - method->size_of_parameters()) * + Interpreter::stackElementWords; + +#ifdef ASSERT + assert(caller->sp() == interpreter_frame->sender_sp(), "Frame not properly walkable"); +#endif + + interpreter_frame->interpreter_frame_set_method(method); + // NOTE the difference between sender_sp and + // interpreter_frame_sender_sp: interpreter_frame_sender_sp is + // the original sp of the caller (the unextended_sp) and + // sender_sp is fp+8/16 (32bit/64bit) XXX + intptr_t* locals = interpreter_frame->sender_sp() + max_locals - 1; + +#ifdef ASSERT + if (caller->is_interpreted_frame()) { + assert(locals < caller->fp() + frame::get_interpreter_frame_initial_sp_offset(), "bad placement"); + } +#endif + + interpreter_frame->interpreter_frame_set_locals(locals); + BasicObjectLock* montop = interpreter_frame->interpreter_frame_monitor_begin(); + BasicObjectLock* monbot = montop - moncount; + interpreter_frame->interpreter_frame_set_monitor_end(monbot); + + // Set last_sp + intptr_t* last_sp = (intptr_t*) monbot - + tempcount*Interpreter::stackElementWords - + popframe_extra_args; + interpreter_frame->interpreter_frame_set_last_sp(last_sp); + + // All frames but the initial (oldest) interpreter frame we fill in have + // a value for sender_sp that allows walking the stack but isn't + // truly correct. Correct the value here.
+ if (extra_locals != 0 && + interpreter_frame->sender_sp() == + interpreter_frame->interpreter_frame_sender_sp()) { + interpreter_frame->set_interpreter_frame_sender_sp(caller->sp() + + extra_locals); + } + *interpreter_frame->interpreter_frame_cache_addr() = + method->constants()->cache(); + *interpreter_frame->interpreter_frame_mirror_addr() = + method->method_holder()->java_mirror(); +} --- /dev/null 2018-09-25 19:24:10.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/assembler_aarch32.cpp 2018-09-25 19:24:09.000000000 +0300 @@ -0,0 +1,2149 @@ +/* + * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * reserved. DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE + * HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include +#include + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "asm/assembler.inline.hpp" +#include "compiler/disassembler.hpp" +#include "interpreter/interpreter.hpp" +#include "memory/resourceArea.hpp" +#include "runtime/interfaceSupport.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "register_aarch32.hpp" +#include "vm_version_aarch32.hpp" + +extern "C" void entry(CodeBuffer *cb); + +#define __ _masm. 
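+// Note: the '__' shorthand defined above expands to '_masm.', so the
+// hand-written instruction sequences below read like an assembly listing;
+// e.g. '__ add(r8, r2, r11, ::lsr(10))' is simply '_masm.add(...)' on the
+// Assembler instance created in entry().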
+#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) block_comment(str) +#endif + +#define BIND(label) bind(label); __ BLOCK_COMMENT(#label ":") + +void entry(CodeBuffer *cb) { + + // { + // for (int i = 0; i < 256; i+=16) + // { + // printf("\"%20.20g\", ", unpack(i)); + // printf("\"%20.20g\", ", unpack(i+1)); + // } + // printf("\n"); + // } + +#if defined(ASSERT) && !defined (__SOFTFP__) + Assembler _masm(cb); + address entry = __ pc(); + + // Smoke test for assembler + + // we're checking the code generation, not applicability of the code to the actual target + // so temporarily override the detected cpu to allow emission of all instructions + const ProcessorFeatures detected_features = VM_Version::features(); + VM_Version::features(FT_ALL); + +// BEGIN Generated code -- do not edit +// Generated by aarch32-asmtest.py + Label back, forth, near, near_post, near_flt, near_post_flt; + __ bind(back); + +// ThreeRegSft + __ add(r8, r2, r11, ::lsr(10)); // add r8, r2, r11, lsr #10 + __ adds(r1, r3, r7, ::asr(1), Assembler::EQ); // addEQs r1, r3, r7, asr #1 + __ eor(r0, r9, r4, ::lsl(5)); // eor r0, r9, r4, lsl #5 + __ eors(r9, r2, r6, ::rrx(), Assembler::GT); // eorGTs r9, r2, r6, rrx + __ sub(r0, r12, lr, ::lsr(0), Assembler::GT); // subGT r0, r12, lr, lsr #0 + __ subs(r8, r2, r4, ::ror(6), Assembler::EQ); // subEQs r8, r2, r4, ror #6 + __ rsb(r8, r9, sp, ::lsl(3)); // rsb r8, r9, sp, lsl #3 + __ rsbs(r8, r0, r4, ::ror(16), Assembler::VS); // rsbVSs r8, r0, r4, ror #16 + __ add(r9, r5, r1, ::lsr(15), Assembler::LE); // addLE r9, r5, r1, lsr #15 + __ adds(r1, sp, r6, ::asr(5)); // adds r1, sp, r6, asr #5 + __ adc(r11, sp, r7, ::asr(1), Assembler::GT); // adcGT r11, sp, r7, asr #1 + __ adcs(r0, r8, r9, ::lsr(6)); // adcs r0, r8, r9, lsr #6 + __ sbc(r9, r3, r6, ::ror(5)); // sbc r9, r3, r6, ror #5 + __ sbcs(r1, sp, r5, ::asr(16), Assembler::HI); // sbcHIs r1, sp, r5, asr #16 + __ rsc(r8, r2, r6, ::lsl(9), Assembler::CC); // rscCC r8, r2, r6, lsl #9 + __ rscs(r10, r4, sp, ::ror(14)); // rscs r10, r4, sp, ror #14 + __ orr(r11, sp, r5, ::lsl(15), Assembler::NE); // orrNE r11, sp, r5, lsl #15 + __ orrs(r9, r10, r4, ::ror(14)); // orrs r9, r10, r4, ror #14 + __ bic(r9, sp, r5, ::ror(1)); // bic r9, sp, r5, ror #1 + __ bics(r0, r2, r7, ::asr(10)); // bics r0, r2, r7, asr #10 + +// ThreeRegRSR + __ add(sp, r6, r7, ::ror(r7)); // add sp, r6, r7, ror r7 + __ adds(r4, r12, r6, ::ror(r7), Assembler::HI); // addHIs r4, r12, r6, ror r7 + __ eor(r5, r6, r7, ::asr(r12), Assembler::LS); // eorLS r5, r6, r7, asr r12 + __ eors(r8, r5, sp, ::lsl(r4), Assembler::AL); // eorALs r8, r5, sp, lsl r4 + __ sub(r2, r12, r5, ::asr(r0)); // sub r2, r12, r5, asr r0 + __ subs(r9, r3, r7, ::lsl(r12), Assembler::HS); // subHSs r9, r3, r7, lsl r12 + __ rsb(r9, r12, r4, ::lsl(r6), Assembler::GT); // rsbGT r9, r12, r4, lsl r6 + __ rsbs(r8, r2, r12, ::lsl(r1)); // rsbs r8, r2, r12, lsl r1 + __ add(r4, r12, sp, ::lsl(sp)); // add r4, r12, sp, lsl sp + __ adds(r8, r11, r6, ::ror(sp)); // adds r8, r11, r6, ror sp + __ adc(r0, r2, r5, ::lsl(r4), Assembler::NE); // adcNE r0, r2, r5, lsl r4 + __ adcs(r11, lr, r6, ::asr(r2)); // adcs r11, lr, r6, asr r2 + __ sbc(r8, r10, lr, ::asr(r3), Assembler::HI); // sbcHI r8, r10, lr, asr r3 + __ sbcs(r1, r12, r5, ::lsl(r6)); // sbcs r1, r12, r5, lsl r6 + __ rsc(r4, r5, lr, ::ror(r10), Assembler::VS); // rscVS r4, r5, lr, ror r10 + __ rscs(r1, r12, sp, ::lsl(r8)); // rscs r1, r12, sp, lsl r8 + __ orr(r8, r1, r6, ::ror(r0), Assembler::VS); // orrVS r8, r1, 
r6, ror r0 + __ orrs(r11, sp, r7, ::ror(r5)); // orrs r11, sp, r7, ror r5 + __ bic(r4, lr, r6, ::lsl(r2), Assembler::AL); // bicAL r4, lr, r6, lsl r2 + __ bics(r10, r11, sp, ::lsl(r3)); // bics r10, r11, sp, lsl r3 + +// TwoRegImm + __ add(r8, sp, (unsigned)268435462U, Assembler::HI); // addHI r8, sp, #268435462 + __ adds(sp, lr, (unsigned)162529280U); // adds sp, lr, #162529280 + __ eor(lr, r6, (unsigned)8192000U); // eor lr, r6, #8192000 + __ eors(r2, r3, (unsigned)292U); // eors r2, r3, #292 + __ sub(r4, sp, (unsigned)227540992U); // sub r4, sp, #227540992 + __ subs(r1, lr, (unsigned)33554432U, Assembler::LT); // subLTs r1, lr, #33554432 + __ rsb(r0, r5, (unsigned)2483027968U); // rsb r0, r5, #2483027968 + __ rsbs(r8, r4, (unsigned)3080192U, Assembler::LO); // rsbLOs r8, r4, #3080192 + __ add(r9, r4, (unsigned)2147483648U, Assembler::LT); // addLT r9, r4, #2147483648 + __ adds(r8, r4, (unsigned)32768U, Assembler::AL); // addALs r8, r4, #32768 + __ adc(r10, lr, (unsigned)10752U, Assembler::CS); // adcCS r10, lr, #10752 + __ adcs(r10, r6, (unsigned)774144U); // adcs r10, r6, #774144 + __ sbc(r2, r12, (unsigned)637534208U); // sbc r2, r12, #637534208 + __ sbcs(r8, r10, (unsigned)692060160U); // sbcs r8, r10, #692060160 + __ rsc(sp, r6, (unsigned)7405568U); // rsc sp, r6, #7405568 + __ rscs(r10, r11, (unsigned)244318208U, Assembler::NE); // rscNEs r10, r11, #244318208 + __ orr(r3, r7, (unsigned)66846720U, Assembler::VS); // orrVS r3, r7, #66846720 + __ orrs(r2, r5, (unsigned)1327104U, Assembler::EQ); // orrEQs r2, r5, #1327104 + __ bic(r8, r1, (unsigned)3744U, Assembler::VS); // bicVS r8, r1, #3744 + __ bics(r0, r2, (unsigned)2684354560U, Assembler::LO); // bicLOs r0, r2, #2684354560 + +// TwoRegSft + __ tst(r8, sp, ::lsl(5)); // tst r8, sp, lsl #5 + __ teq(r6, r7, ::lsr(3)); // teq r6, r7, lsr #3 + __ cmp(r12, r4, ::ror(2)); // cmp r12, r4, ror #2 + __ cmn(r5, r7, ::lsl(16), Assembler::LT); // cmnLT r5, r7, lsl #16 + +// TwoRegRSR + __ tst(r2, lr, ::lsr(r7)); // tst r2, lr, lsr r7 + __ teq(r0, r2, ::ror(r5), Assembler::CC); // teqCC r0, r2, ror r5 + __ cmp(lr, r7, ::lsr(r11), Assembler::LS); // cmpLS lr, r7, lsr r11 + __ cmn(r10, r7, ::lsl(r11), Assembler::VS); // cmnVS r10, r7, lsl r11 + +// OneRegImm + __ tst(r2, (unsigned)557842432U); // tst r2, #557842432 + __ teq(lr, (unsigned)7077888U, Assembler::MI); // teqMI lr, #7077888 + __ cmp(r5, (unsigned)939524096U); // cmp r5, #939524096 + __ cmn(r7, (unsigned)2147483650U, Assembler::LO); // cmnLO r7, #2147483650 + +// Shift op + __ lsl(r0, r4, (unsigned)23U); // lsl r0, r4, #23 + __ lsls(r1, r4, (unsigned)9U); // lsls r1, r4, #9 + __ lsr(r0, r10, (unsigned)3U); // lsr r0, r10, #3 + __ lsrs(r0, r10, (unsigned)20U); // lsrs r0, r10, #20 + __ asr(r1, r9, (unsigned)11U); // asr r1, r9, #11 + __ asrs(r2, r11, (unsigned)10U, Assembler::VS); // asrVSs r2, r11, #10 + +// shift op + __ ror(r8, r2, (unsigned)31U, Assembler::CC); // rorCC r8, r2, #31 + __ rors(r9, r12, (unsigned)8U); // rors r9, r12, #8 + +// ThreeRegNon + __ ror(r8, lr, r7); // ror r8, lr, r7 + __ rors(r12, r3, r4); // rors r12, r3, r4 + __ lsl(r12, sp, lr, Assembler::GT); // lslGT r12, sp, lr + __ lsls(r12, sp, r6, Assembler::AL); // lslALs r12, sp, r6 + __ lsr(r0, r1, r9, Assembler::GT); // lsrGT r0, r1, r9 + __ lsrs(r11, r3, r12, Assembler::GT); // lsrGTs r11, r3, r12 + __ asr(r2, r12, r6, Assembler::LE); // asrLE r2, r12, r6 + __ asrs(r1, r10, r6, Assembler::LT); // asrLTs r1, r10, r6 + +// TwoRegNon + __ mov(r10, r3); // mov r10, r3 + __ movs(r0, r9); // movs r0, r9 + +// 
OneRegImm + __ mov_i(r3, (unsigned)656U, Assembler::VC); // movVC r3, #656 + __ movs_i(r4, (unsigned)2064384U); // movs r4, #2064384 + +// TwoRegSft + __ mov(r12, r6, ::lsr(3)); // mov r12, r6, lsr #3 + __ movs(r5, sp, ::asr(10), Assembler::VC); // movVCs r5, sp, asr #10 + +// TwoRegRSR + __ mov(r1, lr, ::ror(r3)); // mov r1, lr, ror r3 + __ movs(r8, r12, ::ror(r9), Assembler::EQ); // movEQs r8, r12, ror r9 + +// OneRegImm16 + __ movw_i(r11, (unsigned)53041U, Assembler::LO); // movwLO r11, #53041 + __ movt_i(r9, (unsigned)11255U, Assembler::LO); // movtLO r9, #11255 + +// ThreeRegNon + __ mul(r1, sp, r5, Assembler::LE); // mulLE r1, sp, r5 + __ muls(r0, r10, r11); // muls r0, r10, r11 + +// FourRegNon + __ mla(r0, r3, r12, r7); // mla r0, r3, r12, r7 + __ mlas(r8, r11, r3, r6, Assembler::EQ); // mlaEQs r8, r11, r3, r6 + __ umull(lr, r4, r5, r6); // umull lr, r4, r5, r6 + __ umulls(r0, r4, r6, r7); // umulls r0, r4, r6, r7 + __ umlal(r8, r0, r11, lr); // umlal r8, r0, r11, lr + __ umlals(r11, r4, lr, r7); // umlals r11, r4, lr, r7 + __ smull(r1, r5, r6, r7, Assembler::HS); // smullHS r1, r5, r6, r7 + __ smulls(r0, r11, r12, r5, Assembler::MI); // smullMIs r0, r11, r12, r5 + +// FourRegNon + __ umaal(r8, r9, r2, r5); // umaal r8, r9, r2, r5 + __ mls(r0, r4, sp, lr, Assembler::EQ); // mlsEQ r0, r4, sp, lr + +// ThreeRegNon + __ qadd(r9, r4, sp, Assembler::PL); // qaddPL r9, r4, sp + __ qsub(r0, r12, r5, Assembler::MI); // qsubMI r0, r12, r5 + __ qdadd(r3, r5, r7); // qdadd r3, r5, r7 + __ qdsub(r9, r2, r4); // qdsub r9, r2, r4 + +// FourRegNon + __ smlabb(r1, r12, r5, r6); // smlabb r1, r12, r5, r6 + __ smlabt(r0, r10, r12, r6); // smlabt r0, r10, r12, r6 + __ smlatb(r8, r1, r3, lr); // smlatb r8, r1, r3, lr + __ smlatt(r1, sp, r6, r7); // smlatt r1, sp, r6, r7 + __ smlawb(r0, r3, r4, r6); // smlawb r0, r3, r4, r6 + __ smlawt(r11, r4, lr, r7); // smlawt r11, r4, lr, r7 + __ smlalbb(r0, r10, r6, r7); // smlalbb r0, r10, r6, r7 + __ smlalbt(r3, r11, r4, lr, Assembler::LS); // smlalbtLS r3, r11, r4, lr + __ smlaltb(r8, r11, r3, r12); // smlaltb r8, r11, r3, r12 + __ smlaltt(r8, r1, r3, r5); // smlaltt r8, r1, r3, r5 + +// ThreeRegNon + __ smulwb(r2, r12, sp, Assembler::HS); // smulwbHS r2, r12, sp + __ smulwt(r8, r12, r6); // smulwt r8, r12, r6 + __ smulbb(r2, r6, lr, Assembler::GE); // smulbbGE r2, r6, lr + __ smulbt(r8, r12, r7); // smulbt r8, r12, r7 + __ smultb(r10, r3, lr, Assembler::EQ); // smultbEQ r10, r3, lr + __ smultt(r0, r3, sp); // smultt r0, r3, sp + +// MemoryOp + __ ldr(r10, Address(r7, r9, lsl(), Address::ADD, Address::post)); // ldr r10, [r7], r9 + __ ldrb(r0, Address(r9, 196)); // ldrb r0, [r9, #196] + __ ldrh(lr, Address(r4, r6, lsl(), Address::ADD, Address::pre)); // ldrh lr, [r4, r6]! + __ ldrsb(r6, Address(__ pre(r9, 232))); // ldrsb r6, [r9, #232]! + __ ldrsh(r2, Address(r1, r1, lsl(), Address::ADD, Address::post)); // ldrsh r2, [r1], r1 + __ str(r0, Address(r9, r4, lsl(), Address::ADD, Address::post)); // str r0, [r9], r4 + __ strb(r3, Address(__ pre(r5, 92))); // strb r3, [r5, #92]! + __ strh(r2, Address(r8, 160)); // strh r2, [r8, #160] + +// MemoryOp + __ ldr(r8, Address(r12, r8, lsl(), Address::ADD, Address::off)); // ldr r8, [r12, r8] + __ ldrb(r11, Address(__ post(r10, 16))); // ldrb r11, [r10], #16 + __ ldrh(r11, Address(r10, r6, lsl(), Address::ADD, Address::off)); // ldrh r11, [r10, r6] + __ ldrsb(r5, Address(r11, r10, lsl(), Address::ADD, Address::pre)); // ldrsb r5, [r11, r10]! 
+ __ ldrsh(r6, Address(r3, r7, lsl(), Address::ADD, Address::off)); // ldrsh r6, [r3, r7] + __ str(r7, Address(sp, r5, lsl(), Address::ADD, Address::pre)); // str r7, [sp, r5]! + __ strb(r2, Address(r10)); // strb r2, [r10] + __ strh(r6, Address(r4, r3, lsl(), Address::ADD, Address::post)); // strh r6, [r4], r3 + +// MemoryOp + __ ldr(r10, Address(r12)); // ldr r10, [r12] + __ ldrb(r4, Address(__ post(r11, 132))); // ldrb r4, [r11], #132 + __ ldrh(r9, Address(r9, r12, lsl(), Address::ADD, Address::post)); // ldrh r9, [r9], r12 + __ ldrsb(r9, Address(__ post(r3, 148))); // ldrsb r9, [r3], #148 + __ ldrsh(r11, Address(__ pre(r2, 148))); // ldrsh r11, [r2, #148]! + __ str(r11, Address(sp, r11, lsl(), Address::ADD, Address::off)); // str r11, [sp, r11] + __ strb(r1, Address(sp, r10, lsl(), Address::ADD, Address::off)); // strb r1, [sp, r10] + __ strh(r10, Address(lr, r9, lsl(), Address::ADD, Address::post)); // strh r10, [lr], r9 + +// MemoryOp + __ ldr(r6, Address(r3, r4, lsl(), Address::ADD, Address::pre)); // ldr r6, [r3, r4]! + __ ldrb(r4, Address(r6, sp, lsl(), Address::ADD, Address::pre)); // ldrb r4, [r6, sp]! + __ ldrh(r6, Address(r7, r10, lsl(), Address::ADD, Address::post)); // ldrh r6, [r7], r10 + __ ldrsb(r0, Address(r6, r11, lsl(), Address::ADD, Address::pre)); // ldrsb r0, [r6, r11]! + __ ldrsh(r10, Address(r6, sp, lsl(), Address::ADD, Address::post)); // ldrsh r10, [r6], sp + __ str(r7, Address(r3, r12, lsl(), Address::ADD, Address::off)); // str r7, [r3, r12] + __ strb(r3, Address(r8, r1, lsl(), Address::ADD, Address::pre)); // strb r3, [r8, r1]! + __ strh(r4, Address(r12, 64)); // strh r4, [r12, #64] + + __ bind(near); + +// LitMemoryOp + __ ldr(r1, near); // ldr r1, near + __ ldrb(r7, __ pc()); // ldrb r7, . + __ ldrh(r2, near); // ldrh r2, near + __ ldrsb(r10, __ pc()); // ldrsb r10, . + __ ldrsh(lr, near_post); // ldrsh lr, near_post + +// LitMemoryOp + __ ldr(r2, __ pc()); // ldr r2, . + __ ldrb(r3, __ pc()); // ldrb r3, . + __ ldrh(r7, near_post); // ldrh r7, near_post + __ ldrsb(sp, __ pc()); // ldrsb sp, . + __ ldrsh(r10, near); // ldrsh r10, near + +// LitMemoryOp + __ ldr(r5, __ pc()); // ldr r5, . + __ ldrb(lr, near_post); // ldrb lr, near_post + __ ldrh(r5, near_post); // ldrh r5, near_post + __ ldrsb(r6, near); // ldrsb r6, near + __ ldrsh(r11, near); // ldrsh r11, near + +// LitMemoryOp + __ ldr(r7, near_post); // ldr r7, near_post + __ ldrb(r5, near_post); // ldrb r5, near_post + __ ldrh(r10, near); // ldrh r10, near + __ ldrsb(r6, near_post); // ldrsb r6, near_post + __ ldrsh(r9, __ pc()); // ldrsh r9, . + + __ bind(near_post); + +// MemoryRegRegSftOp + __ ldr(r0, Address(r0, r10, ::ror(6), Address::ADD, Address::post)); // ldr r0, [r0], r10, ror #6 + __ ldrb(r3, Address(r8, lr, ::lsl(9), Address::ADD, Address::off)); // ldrb r3, [r8, lr, lsl #9] + __ str(r5, Address(sp, r3, ::lsl(15), Address::ADD, Address::off)); // str r5, [sp, r3, lsl #15] + __ strb(r9, Address(r9, r5, ::asr(2), Address::ADD, Address::post)); // strb r9, [r9], r5, asr #2 + +// MemoryRegRegSftOp + __ ldr(r5, Address(r4, r0, ::ror(6), Address::ADD, Address::off)); // ldr r5, [r4, r0, ror #6] + __ ldrb(lr, Address(r0, r4, ::lsr(9), Address::ADD, Address::off)); // ldrb lr, [r0, r4, lsr #9] + __ str(r5, Address(r12, r12, ::asr(5), Address::ADD, Address::post)); // str r5, [r12], r12, asr #5 + __ strb(r3, Address(r1, r7, ::ror(12), Address::ADD, Address::pre)); // strb r3, [r1, r7, ror #12]! 
+ +// MemoryRegRegSftOp + __ ldr(r6, Address(r2, r3, ::rrx(), Address::ADD, Address::pre)); // ldr r6, [r2, r3, rrx]! + __ ldrb(r8, Address(lr, r2, ::asr(16), Address::ADD, Address::pre)); // ldrb r8, [lr, r2, asr #16]! + __ str(r6, Address(r3, r6, ::ror(7), Address::ADD, Address::pre)); // str r6, [r3, r6, ror #7]! + __ strb(r3, Address(r8, r2, ::lsl(10), Address::ADD, Address::off)); // strb r3, [r8, r2, lsl #10] + +// MemoryRegRegSftOp + __ ldr(r11, Address(sp, lr, ::lsl(8), Address::ADD, Address::off)); // ldr r11, [sp, lr, lsl #8] + __ ldrb(r10, Address(sp, r12, ::lsl(4), Address::ADD, Address::pre)); // ldrb r10, [sp, r12, lsl #4]! + __ str(sp, Address(r9, r2, ::asr(2), Address::ADD, Address::off)); // str sp, [r9, r2, asr #2] + __ strb(r7, Address(r11, lr, ::asr(14), Address::ADD, Address::pre)); // strb r7, [r11, lr, asr #14]! + +// LdStOne + __ ldrex(r12, r11); // ldrex r12, [r11] + __ ldrexb(r4, r12); // ldrexb r4, [r12] + __ ldrexh(r11, r11); // ldrexh r11, [r11] + +// LdStTwo + __ strex(r1, r7, lr); // strex r1, r7, [lr] + __ strexb(r12, r6, r4); // strexb r12, r6, [r4] + __ strexh(r4, r6, r7, Assembler::HS); // strexhHS r4, r6, [r7] + +// ThreeRegNon + __ sadd16(r3, r4, r7); // sadd16 r3, r4, r7 + __ sasx(r9, r10, r3, Assembler::AL); // sasxAL r9, r10, r3 + __ ssax(r12, r5, r6); // ssax r12, r5, r6 + __ ssub16(r12, r5, lr); // ssub16 r12, r5, lr + __ sadd8(r0, r10, r7); // sadd8 r0, r10, r7 + __ ssub8(r0, r8, r2, Assembler::VS); // ssub8VS r0, r8, r2 + __ qadd16(r11, r4, r5, Assembler::PL); // qadd16PL r11, r4, r5 + __ qasx(r11, r3, r12, Assembler::VS); // qasxVS r11, r3, r12 + __ qsax(r0, r3, r5); // qsax r0, r3, r5 + __ ssub16(r10, r12, r5, Assembler::AL); // ssub16AL r10, r12, r5 + __ qadd8(r10, r6, lr, Assembler::CC); // qadd8CC r10, r6, lr + __ qsub8(r10, r11, r7); // qsub8 r10, r11, r7 + __ shadd16(r9, r4, lr, Assembler::PL); // shadd16PL r9, r4, lr + __ shasx(r1, lr, r7); // shasx r1, lr, r7 + __ shsax(r9, r11, r5, Assembler::LO); // shsaxLO r9, r11, r5 + __ shsub16(r3, r1, r11, Assembler::GE); // shsub16GE r3, r1, r11 + __ shadd8(sp, r5, r7, Assembler::GT); // shadd8GT sp, r5, r7 + __ shsub8(r1, r5, r7); // shsub8 r1, r5, r7 + +// ThreeRegNon + __ uadd16(r10, r4, r7); // uadd16 r10, r4, r7 + __ uasx(r1, r9, r7, Assembler::HS); // uasxHS r1, r9, r7 + __ usax(r11, sp, r7); // usax r11, sp, r7 + __ usub16(r11, r4, lr); // usub16 r11, r4, lr + __ uadd8(r2, sp, r7, Assembler::LO); // uadd8LO r2, sp, r7 + __ usub8(r8, r10, lr, Assembler::GT); // usub8GT r8, r10, lr + __ uqadd16(r3, r12, sp); // uqadd16 r3, r12, sp + __ uqasx(r4, sp, r6); // uqasx r4, sp, r6 + __ uqsax(r1, r10, lr); // uqsax r1, r10, lr + __ uqsub16(r2, sp, lr, Assembler::LE); // uqsub16LE r2, sp, lr + __ uqadd8(r1, r12, r5); // uqadd8 r1, r12, r5 + __ uqsub8(r0, r4, sp, Assembler::GT); // uqsub8GT r0, r4, sp + __ uhadd16(r0, r10, r5, Assembler::HI); // uhadd16HI r0, r10, r5 + __ uhasx(r11, r4, r7, Assembler::LE); // uhasxLE r11, r4, r7 + __ uhsax(r1, lr, r9, Assembler::GE); // uhsaxGE r1, lr, r9 + __ uhsub16(r2, r11, lr); // uhsub16 r2, r11, lr + __ uhadd8(r9, r4, r5, Assembler::GE); // uhadd8GE r9, r4, r5 + __ uhsub8(r2, sp, lr, Assembler::HI); // uhsub8HI r2, sp, lr + +// PKUPSATREV + __ sxtab16(r10, r3, r7, ::ror(16)); // sxtab16 r10, r3, r7, ROR #16 + __ sxtab(r9, r5, r7, ::ror(24), Assembler::CS); // sxtabCS r9, r5, r7, ROR #24 + __ sxtah(r3, r5, r7, ::ror(8)); // sxtah r3, r5, r7, ROR #8 + __ uxtab16(r8, r4, r6, ::ror(8), Assembler::AL); // uxtab16AL r8, r4, r6, ROR #8 + __ uxtab(r0, r11, sp, ::rrx(), 
Assembler::EQ); // uxtabEQ r0, r11, sp, ROR #0 + __ uxtah(r9, r12, r5, ::rrx()); // uxtah r9, r12, r5, ROR #0 + +// PKUPSATREV + __ sxtb16(r3, r11, ::ror(16), Assembler::GE); // sxtb16GE r3, r11, ROR #16 + __ sxtb(r2, r6, ::rrx(), Assembler::HI); // sxtbHI r2, r6, ROR #0 + __ sxth(r3, sp, ::ror(24), Assembler::GT); // sxthGT r3, sp, ROR #24 + __ uxtb16(r12, r5, ::ror(16)); // uxtb16 r12, r5, ROR #16 + __ uxtb(r12, r5, ::ror(16)); // uxtb r12, r5, ROR #16 + __ uxth(r8, r5, ::ror(16)); // uxth r8, r5, ROR #16 + +// TwoRegNon + __ rev(r10, r4, Assembler::EQ); // revEQ r10, r4 + __ rev16(r8, r12, Assembler::GE); // rev16GE r8, r12 + __ rbit(lr, r7); // rbit lr, r7 + __ revsh(sp, r7, Assembler::GT); // revshGT sp, r7 + +// ThreeRegNon + __ sdiv(r9, sp, lr); // sdiv r9, sp, lr + __ udiv(r2, r12, r6); // udiv r2, r12, r6 + +// TwoRegTwoImm + __ sbfx(r0, r1, (unsigned)20U, (unsigned)3U, Assembler::MI); // sbfxMI r0, r1, #20, #3 + __ ubfx(r9, r2, (unsigned)16U, (unsigned)15U); // ubfx r9, r2, #16, #15 + __ bfi(r1, r11, (unsigned)27U, (unsigned)3U, Assembler::HI); // bfiHI r1, r11, #27, #3 + +// TwoRegTwoImm + __ bfc(r3, (unsigned)7U, (unsigned)10U); // bfc r3, #7, #10 + +// MultipleMemOp + __ stmda(r6, 3435U, false); // stmda r6, {r0, r1, r3, r5, r6, r8, r10, r11} + __ stmed(r4, 14559U, false); // stmed r4, {r0, r1, r2, r3, r4, r6, r7, r11, r12, sp} + __ ldmda(r0, 57812U, false); // ldmda r0, {r2, r4, r6, r7, r8, sp, lr, pc} + __ ldmfa(r12, 39027U, true); // ldmfa r12!, {r0, r1, r4, r5, r6, r11, r12, pc} + __ stmia(r9, 12733U, true); // stmia r9!, {r0, r2, r3, r4, r5, r7, r8, r12, sp} + __ stmea(r11, 21955U, false); // stmea r11, {r0, r1, r6, r7, r8, r10, r12, lr} + __ ldmia(r12, 48418U, true); // ldmia r12!, {r1, r5, r8, r10, r11, r12, sp, pc} + __ ldmfd(sp, 41226U, true); // ldmfd sp!, {r1, r3, r8, sp, pc} + __ stmdb(r11, 8729U, true); // stmdb r11!, {r0, r3, r4, r9, sp} + __ stmfd(r9, 36309U, true); // stmfd r9!, {r0, r2, r4, r6, r7, r8, r10, r11, pc} + __ ldmdb(r5, 24667U, true); // ldmdb r5!, {r0, r1, r3, r4, r6, sp, lr} + __ ldmea(r1, 37287U, false); // ldmea r1, {r0, r1, r2, r5, r7, r8, r12, pc} + __ stmib(r11, 28266U, true); // stmib r11!, {r1, r3, r5, r6, r9, r10, r11, sp, lr} + __ stmfa(r11, 17671U, false); // stmfa r11, {r0, r1, r2, r8, r10, lr} + __ ldmib(r0, 21452U, true); // ldmib r0!, {r2, r3, r6, r7, r8, r9, r12, lr} + __ ldmed(r1, 11751U, false); // ldmed r1, {r0, r1, r2, r5, r6, r7, r8, r10, r11, sp} + +// BranchLabel + __ b(forth, Assembler::CS); // bCS forth + __ bl(__ pc(), Assembler::MI); // blMI . + +// OneRegNon + __ b(r0, Assembler::VS); // bxVS r0 + __ bl(r3); // blx r3 + +// BranchLabel + __ b(__ pc(), Assembler::AL); // bAL . + __ bl(__ pc()); // bl . + +// OneRegNon + __ b(r0, Assembler::VS); // bxVS r0 + __ bl(r5); // blx r5 + +// BranchLabel + __ b(forth, Assembler::LE); // bLE forth + __ bl(__ pc(), Assembler::MI); // blMI . + +// OneRegNon + __ b(r9, Assembler::NE); // bxNE r9 + __ bl(r12); // blx r12 + +// BranchLabel + __ b(back); // b back + __ bl(__ pc(), Assembler::HI); // blHI . + +// OneRegNon + __ b(r1, Assembler::VC); // bxVC r1 + __ bl(r7, Assembler::GT); // blxGT r7 + +// BranchLabel + __ b(back, Assembler::GE); // bGE back + __ bl(__ pc(), Assembler::HI); // blHI . + +// OneRegNon + __ b(r12); // bx r12 + __ bl(r7, Assembler::CC); // blxCC r7 + +// BranchLabel + __ b(__ pc()); // b . + __ bl(back, Assembler::GT); // blGT back + +// OneRegNon + __ b(r1, Assembler::GE); // bxGE r1 + __ bl(r0); // blx r0 + +// BranchLabel + __ b(__ pc()); // b . 
+ __ bl(forth); // bl forth + +// OneRegNon + __ b(lr, Assembler::GT); // bxGT lr + __ bl(r11, Assembler::NE); // blxNE r11 + +// BranchLabel + __ b(__ pc(), Assembler::CS); // bCS . + __ bl(__ pc()); // bl . + +// OneRegNon + __ b(r10, Assembler::HS); // bxHS r10 + __ bl(r4); // blx r4 + +// BranchLabel + __ b(back, Assembler::AL); // bAL back + __ bl(__ pc()); // bl . + +// OneRegNon + __ b(r12, Assembler::LO); // bxLO r12 + __ bl(r8); // blx r8 + +// BranchLabel + __ b(forth); // b forth + __ bl(__ pc()); // bl . + +// OneRegNon + __ b(r10); // bx r10 + __ bl(r1); // blx r1 + +// ThreeFltNon + __ vmla_f32(f4, f8, f12, Assembler::MI); // vmlaMI.f32 s4, s8, s12 + __ vmls_f32(f4, f10, f10); // vmls.f32 s4, s10, s10 + __ vnmla_f32(f2, f10, f12); // vnmla.f32 s2, s10, s12 + __ vnmls_f32(f8, f6, f8, Assembler::LT); // vnmlsLT.f32 s8, s6, s8 + __ vnmul_f32(f6, f12, f14, Assembler::MI); // vnmulMI.f32 s6, s12, s14 + __ vadd_f32(f0, f2, f0); // vadd.f32 s0, s2, s0 + __ vsub_f32(f2, f4, f10, Assembler::AL); // vsubAL.f32 s2, s4, s10 + __ vdiv_f32(f0, f2, f12, Assembler::CS); // vdivCS.f32 s0, s2, s12 + +// ThreeFltNon + __ vmla_f64(d0, d3, d6); // vmla.f64 d0, d3, d6 + __ vmls_f64(d0, d1, d5); // vmls.f64 d0, d1, d5 + __ vnmla_f64(d1, d4, d6); // vnmla.f64 d1, d4, d6 + __ vnmls_f64(d0, d1, d1, Assembler::NE); // vnmlsNE.f64 d0, d1, d1 + __ vnmul_f64(d3, d5, d5, Assembler::NE); // vnmulNE.f64 d3, d5, d5 + __ vadd_f64(d0, d2, d4, Assembler::LO); // vaddLO.f64 d0, d2, d4 + __ vsub_f64(d1, d2, d4); // vsub.f64 d1, d2, d4 + __ vdiv_f64(d0, d1, d5, Assembler::MI); // vdivMI.f64 d0, d1, d5 + +// TwoFltNon + __ vabs_f32(f6, f6); // vabs.f32 s6, s6 + __ vneg_f32(f6, f8, Assembler::PL); // vnegPL.f32 s6, s8 + __ vsqrt_f32(f0, f8); // vsqrt.f32 s0, s8 + +// TwoFltNon + __ vabs_f64(d0, d4); // vabs.f64 d0, d4 + __ vneg_f64(d1, d4); // vneg.f64 d1, d4 + __ vsqrt_f64(d0, d1); // vsqrt.f64 d0, d1 + +// vmov_f32 + __ vmov_f32(f0, lr, Assembler::PL); // vmovPL.f32 s0, lr + +// vmov_f32 + __ vmov_f32(r11, f8); // vmov.f32 r11, s8 + +// vmov_f64 + __ vmov_f64(d1, r11, lr, Assembler::LT); // vmovLT.f64 d1, r11, lr + +// vmov_f64 + __ vmov_f64(r7, r5, d5); // vmov.f64 r7, r5, d5 + +// vmov_f32 + __ vmov_f32(f8, f12); // vmov.f32 s8, s12 + +// vmov_f64 + __ vmov_f64(d1, d2, Assembler::HI); // vmovHI.f64 d1, d2 + +// vmov_f32 + __ vmov_f32(f4, 1.0f, Assembler::VS); // vmovVS.f32 s4, #1.0 + +// vmov_f64 + __ vmov_f64(d2, 1.0); // vmov.f64 d2, #1.0 + +// vmov_f32 + __ vmov_f32(f6, 2.0f); // vmov.f32 s6, #2.0 + +// vmov_f64 + __ vmov_f64(d1, 2.0); // vmov.f64 d1, #2.0 + +// vector memory + __ vldr_f32(f4, Address(r5, 116)); // vldr.f32 s4, [r5, #116] + __ vstr_f32(f2, Address(r1, 56), Assembler::CC); // vstrCC.f32 s2, [r1, #56] + +// vector memory + __ vldr_f64(d7, Address(r5, 16), Assembler::NE); // vldrNE.f64 d7, [r5, #16] + __ vstr_f64(d6, Address(r1, 228)); // vstr.f64 d6, [r1, #228] + + __ bind(near_flt); + +// vector memory + __ vldr_f32(f2, near_post_flt); // vldr.f32 s2, near_post_flt + __ vstr_f32(f6, near_post_flt); // vstr.f32 s6, near_post_flt + +// vector memory + __ vldr_f64(d2, near_flt, Assembler::LT); // vldrLT.f64 d2, near_flt + __ vstr_f64(d3, __ pc(), Assembler::GT); // vstrGT.f64 d3, . 
+ +// vector memory + __ vldr_f32(f4, near_post_flt, Assembler::CC); // vldrCC.f32 s4, near_post_flt + __ vstr_f32(f0, near_post_flt); // vstr.f32 s0, near_post_flt + +// vector memory + __ vldr_f64(d4, near_post_flt, Assembler::GT); // vldrGT.f64 d4, near_post_flt + __ vstr_f64(d0, near_flt); // vstr.f64 d0, near_flt + +// vector memory + __ vldr_f32(f8, near_post_flt); // vldr.f32 s8, near_post_flt + __ vstr_f32(f6, near_post_flt); // vstr.f32 s6, near_post_flt + +// vector memory + __ vldr_f64(d4, near_flt, Assembler::PL); // vldrPL.f64 d4, near_flt + __ vstr_f64(d5, near_flt); // vstr.f64 d5, near_flt + +// vector memory + __ vldr_f32(f8, near_post_flt, Assembler::LS); // vldrLS.f32 s8, near_post_flt + __ vstr_f32(f12, __ pc(), Assembler::CC); // vstrCC.f32 s12, . + +// vector memory + __ vldr_f64(d6, near_post_flt, Assembler::AL); // vldrAL.f64 d6, near_post_flt + __ vstr_f64(d1, near_post_flt, Assembler::LT); // vstrLT.f64 d1, near_post_flt + + __ bind(near_post_flt); + +// FltMultMemOp + __ vldmia_f32(r1, FloatRegSet::of(f4).bits(), false); // vldmia.f32 r1, {s4} + __ vstmia_f32(r6, FloatRegSet::of(f4).bits(), true, Assembler::CS); // vstmiaCS.f32 r6!, {s4} + +// DblMultMemOp + __ vldmia_f64(r9, DoubleFloatRegSet::of(d1, d2, d3, d4).bits(), true); // vldmia.f64 r9!, {d1, d2, d3, d4} + __ vstmia_f64(r3, DoubleFloatRegSet::of(d6, d7).bits(), true); // vstmia.f64 r3!, {d6, d7} + +// FltMultMemOp + __ vldmdb_f32(r2, FloatRegSet::of(f6).bits(), Assembler::VS); // vldmdbVS.f32 r2!, {s6} + __ vstmdb_f32(r6, FloatRegSet::of(f14).bits()); // vstmdb.f32 r6!, {s14} + +// DblMultMemOp + __ vldmdb_f64(sp, DoubleFloatRegSet::of(d4, d5, d6, d7).bits()); // vldmdb.f64 sp!, {d4, d5, d6, d7} + __ vstmdb_f64(r0, DoubleFloatRegSet::of(d5, d6, d7).bits()); // vstmdb.f64 r0!, {d5, d6, d7} + +// vcmp_f32 + __ vcmp_f32(f2, f2); // vcmp.f32 s2, s2 + +// vcmpe_f32 + __ vcmpe_f32(f8, f8, Assembler::VC); // vcmpeVC.f32 s8, s8 + +// vcmp_f64 + __ vcmp_f64(d0, d6); // vcmp.f64 d0, d6 + +// vcmpe_f64 + __ vcmpe_f64(d3, d7, Assembler::GE); // vcmpeGE.f64 d3, d7 + +// vcmp_f32 + __ vcmp_f32(f2, 0.0f, Assembler::LT); // vcmpLT.f32 s2, #0.0 + +// vcmpe_f32 + __ vcmpe_f32(f14, 0.0f, Assembler::GT); // vcmpeGT.f32 s14, #0.0 + +// vcmp_f64 + __ vcmp_f64(d4, 0.0); // vcmp.f64 d4, #0.0 + +// vcmpe_f64 + __ vcmpe_f64(d1, 0.0); // vcmpe.f64 d1, #0.0 + +// vcvt + __ vcvt_s32_f32(f2, f6, Assembler::VS); // vcvtVS.s32.f32 s2, s6 + __ vcvt_u32_f32(f6, f14, Assembler::GT); // vcvtGT.u32.f32 s6, s14 + __ vcvt_f32_s32(f0, f2, Assembler::CC); // vcvtCC.f32.s32 s0, s2 + __ vcvt_f32_u32(f2, f4, Assembler::CC); // vcvtCC.f32.u32 s2, s4 + +// vcvt + __ vcvt_s32_f64(f4, d4, Assembler::HI); // vcvtHI.s32.f64 s4, d4 + __ vcvt_u32_f64(f6, d6, Assembler::HI); // vcvtHI.u32.f64 s6, d6 + __ vcvt_f32_f64(f6, d7, Assembler::LS); // vcvtLS.f32.f64 s6, d7 + +// vcvt + __ vcvt_f64_s32(d3, f8); // vcvt.f64.s32 d3, s8 + __ vcvt_f64_u32(d5, f14, Assembler::EQ); // vcvtEQ.f64.u32 d5, s14 + __ vcvt_f64_f32(d4, f10, Assembler::AL); // vcvtAL.f64.f32 d4, s10 + +// BKPT + __ bkpt((unsigned)26U); // bkpt #26 + + __ bind(forth); + +/* +aarch32ops.o: file format elf32-littlearm + + +Disassembly of section .text: + +00000000 : + 0: e082852b add r8, r2, fp, lsr #10 + 4: 009310c7 addseq r1, r3, r7, asr #1 + 8: e0290284 eor r0, r9, r4, lsl #5 + c: c0329066 eorsgt r9, r2, r6, rrx + 10: c04c000e subgt r0, ip, lr + 14: 00528364 subseq r8, r2, r4, ror #6 + 18: e069818d rsb r8, r9, sp, lsl #3 + 1c: 60708864 rsbsvs r8, r0, r4, ror #16 + 20: d08597a1 addle r9, r5, r1, 
lsr #15 + 24: e09d12c6 adds r1, sp, r6, asr #5 + 28: c0adb0c7 adcgt fp, sp, r7, asr #1 + 2c: e0b80329 adcs r0, r8, r9, lsr #6 + 30: e0c392e6 sbc r9, r3, r6, ror #5 + 34: 80dd1845 sbcshi r1, sp, r5, asr #16 + 38: 30e28486 rsccc r8, r2, r6, lsl #9 + 3c: e0f4a76d rscs sl, r4, sp, ror #14 + 40: 118db785 orrne fp, sp, r5, lsl #15 + 44: e19a9764 orrs r9, sl, r4, ror #14 + 48: e1cd90e5 bic r9, sp, r5, ror #1 + 4c: e1d20547 bics r0, r2, r7, asr #10 + 50: e086d777 add sp, r6, r7, ror r7 + 54: 809c4776 addshi r4, ip, r6, ror r7 + 58: 90265c57 eorls r5, r6, r7, asr ip + 5c: e035841d eors r8, r5, sp, lsl r4 + 60: e04c2055 sub r2, ip, r5, asr r0 + 64: 20539c17 subscs r9, r3, r7, lsl ip + 68: c06c9614 rsbgt r9, ip, r4, lsl r6 + 6c: e072811c rsbs r8, r2, ip, lsl r1 + 70: e08c4d1d add r4, ip, sp, lsl sp + 74: e09b8d76 adds r8, fp, r6, ror sp + 78: 10a20415 adcne r0, r2, r5, lsl r4 + 7c: e0beb256 adcs fp, lr, r6, asr r2 + 80: 80ca835e sbchi r8, sl, lr, asr r3 + 84: e0dc1615 sbcs r1, ip, r5, lsl r6 + 88: 60e54a7e rscvs r4, r5, lr, ror sl + 8c: e0fc181d rscs r1, ip, sp, lsl r8 + 90: 61818076 orrvs r8, r1, r6, ror r0 + 94: e19db577 orrs fp, sp, r7, ror r5 + 98: e1ce4216 bic r4, lr, r6, lsl r2 + 9c: e1dba31d bics sl, fp, sp, lsl r3 + a0: 828d8261 addhi r8, sp, #268435462 ; 0x10000006 + a4: e29ed69b adds sp, lr, #162529280 ; 0x9b00000 + a8: e226e87d eor lr, r6, #8192000 ; 0x7d0000 + ac: e2332f49 eors r2, r3, #292 ; 0x124 + b0: e24d46d9 sub r4, sp, #227540992 ; 0xd900000 + b4: b25e1402 subslt r1, lr, #33554432 ; 0x2000000 + b8: e2650325 rsb r0, r5, #-1811939328 ; 0x94000000 + bc: 3274882f rsbscc r8, r4, #3080192 ; 0x2f0000 + c0: b2849102 addlt r9, r4, #-2147483648 ; 0x80000000 + c4: e2948902 adds r8, r4, #32768 ; 0x8000 + c8: 22aeac2a adccs sl, lr, #10752 ; 0x2a00 + cc: e2b6aabd adcs sl, r6, #774144 ; 0xbd000 + d0: e2cc2426 sbc r2, ip, #637534208 ; 0x26000000 + d4: e2da85a5 sbcs r8, sl, #692060160 ; 0x29400000 + d8: e2e6d871 rsc sp, r6, #7405568 ; 0x710000 + dc: 12fba6e9 rscsne sl, fp, #244318208 ; 0xe900000 + e0: 638737ff orrvs r3, r7, #66846720 ; 0x3fc0000 + e4: 03952951 orrseq r2, r5, #1327104 ; 0x144000 + e8: 63c18eea bicvs r8, r1, #3744 ; 0xea0 + ec: 33d2020a bicscc r0, r2, #-1610612736 ; 0xa0000000 + f0: e118028d tst r8, sp, lsl #5 + f4: e13601a7 teq r6, r7, lsr #3 + f8: e15c0164 cmp ip, r4, ror #2 + fc: b1750807 cmnlt r5, r7, lsl #16 + 100: e112073e tst r2, lr, lsr r7 + 104: 31300572 teqcc r0, r2, ror r5 + 108: 915e0b37 cmpls lr, r7, lsr fp + 10c: 617a0b17 cmnvs sl, r7, lsl fp + 110: e3120585 tst r2, #557842432 ; 0x21400000 + 114: 433e071b teqmi lr, #7077888 ; 0x6c0000 + 118: e355030e cmp r5, #939524096 ; 0x38000000 + 11c: 3377010a cmncc r7, #-2147483646 ; 0x80000002 + 120: e1a00b84 lsl r0, r4, #23 + 124: e1b01484 lsls r1, r4, #9 + 128: e1a001aa lsr r0, sl, #3 + 12c: e1b00a2a lsrs r0, sl, #20 + 130: e1a015c9 asr r1, r9, #11 + 134: 61b0254b asrsvs r2, fp, #10 + 138: 31a08fe2 rorcc r8, r2, #31 + 13c: e1b0946c rors r9, ip, #8 + 140: e1a0877e ror r8, lr, r7 + 144: e1b0c473 rors ip, r3, r4 + 148: c1a0ce1d lslgt ip, sp, lr + 14c: e1b0c61d lsls ip, sp, r6 + 150: c1a00931 lsrgt r0, r1, r9 + 154: c1b0bc33 lsrsgt fp, r3, ip + 158: d1a0265c asrle r2, ip, r6 + 15c: b1b0165a asrslt r1, sl, r6 + 160: e1a0a003 mov sl, r3 + 164: e1b00009 movs r0, r9 + 168: 73a03e29 movvc r3, #656 ; 0x290 + 16c: e3b0497e movs r4, #2064384 ; 0x1f8000 + 170: e1a0c1a6 lsr ip, r6, #3 + 174: 71b0554d asrsvc r5, sp, #10 + 178: e1a0137e ror r1, lr, r3 + 17c: 01b0897c rorseq r8, ip, r9 + 180: 330cbf31 movwcc fp, #53041 ; 0xcf31 + 184: 33429bf7 
movtcc r9, #11255 ; 0x2bf7 + 188: d001059d mulle r1, sp, r5 + 18c: e0100b9a muls r0, sl, fp + 190: e0207c93 mla r0, r3, ip, r7 + 194: 0038639b mlaseq r8, fp, r3, r6 + 198: e084e695 umull lr, r4, r5, r6 + 19c: e0940796 umulls r0, r4, r6, r7 + 1a0: e0a08e9b umlal r8, r0, fp, lr + 1a4: e0b4b79e umlals fp, r4, lr, r7 + 1a8: 20c51796 smullcs r1, r5, r6, r7 + 1ac: 40db059c smullsmi r0, fp, ip, r5 + 1b0: e0498592 umaal r8, r9, r2, r5 + 1b4: 0060ed94 mlseq r0, r4, sp, lr + 1b8: 510d9054 qaddpl r9, r4, sp + 1bc: 4125005c qsubmi r0, ip, r5 + 1c0: e1473055 qdadd r3, r5, r7 + 1c4: e1649052 qdsub r9, r2, r4 + 1c8: e101658c smlabb r1, ip, r5, r6 + 1cc: e1006cca smlabt r0, sl, ip, r6 + 1d0: e108e3a1 smlatb r8, r1, r3, lr + 1d4: e10176ed smlatt r1, sp, r6, r7 + 1d8: e1206483 smlawb r0, r3, r4, r6 + 1dc: e12b7ec4 smlawt fp, r4, lr, r7 + 1e0: e14a0786 smlalbb r0, sl, r6, r7 + 1e4: 914b3ec4 smlalbtls r3, fp, r4, lr + 1e8: e14b8ca3 smlaltb r8, fp, r3, ip + 1ec: e14185e3 smlaltt r8, r1, r3, r5 + 1f0: 21220dac smulwbcs r2, ip, sp + 1f4: e12806ec smulwt r8, ip, r6 + 1f8: a1620e86 smulbbge r2, r6, lr + 1fc: e16807cc smulbt r8, ip, r7 + 200: 016a0ea3 smultbeq sl, r3, lr + 204: e1600de3 smultt r0, r3, sp + 208: e697a009 ldr sl, [r7], r9 + 20c: e5d900c4 ldrb r0, [r9, #196] ; 0xc4 + 210: e1b4e0b6 ldrh lr, [r4, r6]! + 214: e1f96ed8 ldrsb r6, [r9, #232]! ; 0xe8 + 218: e09120f1 ldrsh r2, [r1], r1 + 21c: e6890004 str r0, [r9], r4 + 220: e5e5305c strb r3, [r5, #92]! ; 0x5c + 224: e1c82ab0 strh r2, [r8, #160] ; 0xa0 + 228: e79c8008 ldr r8, [ip, r8] + 22c: e4dab010 ldrb fp, [sl], #16 + 230: e19ab0b6 ldrh fp, [sl, r6] + 234: e1bb50da ldrsb r5, [fp, sl]! + 238: e19360f7 ldrsh r6, [r3, r7] + 23c: e7ad7005 str r7, [sp, r5]! + 240: e5ca2000 strb r2, [sl] + 244: e08460b3 strh r6, [r4], r3 + 248: e59ca000 ldr sl, [ip] + 24c: e4db4084 ldrb r4, [fp], #132 ; 0x84 + 250: e09990bc ldrh r9, [r9], ip + 254: e0d399d4 ldrsb r9, [r3], #148 ; 0x94 + 258: e1f2b9f4 ldrsh fp, [r2, #148]! ; 0x94 + 25c: e78db00b str fp, [sp, fp] + 260: e7cd100a strb r1, [sp, sl] + 264: e08ea0b9 strh sl, [lr], r9 + 268: e7b36004 ldr r6, [r3, r4]! + 26c: e7f6400d ldrb r4, [r6, sp]! + 270: e09760ba ldrh r6, [r7], sl + 274: e1b600db ldrsb r0, [r6, fp]! + 278: e096a0fd ldrsh sl, [r6], sp + 27c: e783700c str r7, [r3, ip] + 280: e7e83001 strb r3, [r8, r1]! + 284: e1cc44b0 strh r4, [ip, #64] ; 0x40 + +00000288 : + 288: e51f1008 ldr r1, [pc, #-8] ; 288 + 28c: e55f7008 ldrb r7, [pc, #-8] ; 28c + 290: e15f21b0 ldrh r2, [pc, #-16] ; 288 + 294: e15fa0d8 ldrsb sl, [pc, #-8] ; 294 + 298: e1dfe3f8 ldrsh lr, [pc, #56] ; 2d8 + 29c: e51f2008 ldr r2, [pc, #-8] ; 29c + 2a0: e55f3008 ldrb r3, [pc, #-8] ; 2a0 + 2a4: e1df72bc ldrh r7, [pc, #44] ; 2d8 + 2a8: e15fd0d8 ldrsb sp, [pc, #-8] ; 2a8 + 2ac: e15fa2fc ldrsh sl, [pc, #-44] ; 288 + 2b0: e51f5008 ldr r5, [pc, #-8] ; 2b0 + 2b4: e5dfe01c ldrb lr, [pc, #28] ; 2d8 + 2b8: e1df51b8 ldrh r5, [pc, #24] ; 2d8 + 2bc: e15f63dc ldrsb r6, [pc, #-60] ; 288 + 2c0: e15fb4f0 ldrsh fp, [pc, #-64] ; 288 + 2c4: e59f700c ldr r7, [pc, #12] ; 2d8 + 2c8: e5df5008 ldrb r5, [pc, #8] ; 2d8 + 2cc: e15fa4bc ldrh sl, [pc, #-76] ; 288 + 2d0: e1df60d0 ldrsb r6, [pc] ; 2d8 + 2d4: e15f90f8 ldrsh r9, [pc, #-8] ; 2d4 + +000002d8 : + 2d8: e690036a ldr r0, [r0], sl, ror #6 + 2dc: e7d8348e ldrb r3, [r8, lr, lsl #9] + 2e0: e78d5783 str r5, [sp, r3, lsl #15] + 2e4: e6c99145 strb r9, [r9], r5, asr #2 + 2e8: e7945360 ldr r5, [r4, r0, ror #6] + 2ec: e7d0e4a4 ldrb lr, [r0, r4, lsr #9] + 2f0: e68c52cc str r5, [ip], ip, asr #5 + 2f4: e7e13667 strb r3, [r1, r7, ror #12]! 
+ 2f8: e7b26063 ldr r6, [r2, r3, rrx]! + 2fc: e7fe8842 ldrb r8, [lr, r2, asr #16]! + 300: e7a363e6 str r6, [r3, r6, ror #7]! + 304: e7c83502 strb r3, [r8, r2, lsl #10] + 308: e79db40e ldr fp, [sp, lr, lsl #8] + 30c: e7fda20c ldrb sl, [sp, ip, lsl #4]! + 310: e789d142 str sp, [r9, r2, asr #2] + 314: e7eb774e strb r7, [fp, lr, asr #14]! + 318: e19bcf9f ldrex r12, [fp] + 31c: e1dc4f9f ldrexb r4, [ip] + 320: e1fbbf9f ldrexh fp, [fp] + 324: e18e1f97 strex r1, r7, [lr] + 328: e1c4cf96 strexb ip, r6, [r4] + 32c: 21e74f96 strexhcs r4, r6, [r7] + 330: e6143f17 sadd16 r3, r4, r7 + 334: e61a9f33 sasx r9, sl, r3 + 338: e615cf56 ssax ip, r5, r6 + 33c: e615cf7e ssub16 ip, r5, lr + 340: e61a0f97 sadd8 r0, sl, r7 + 344: 66180ff2 ssub8vs r0, r8, r2 + 348: 5624bf15 qadd16pl fp, r4, r5 + 34c: 6623bf3c qasxvs fp, r3, ip + 350: e6230f55 qsax r0, r3, r5 + 354: e61caf75 ssub16 sl, ip, r5 + 358: 3626af9e qadd8cc sl, r6, lr + 35c: e62baff7 qsub8 sl, fp, r7 + 360: 56349f1e shadd16pl r9, r4, lr + 364: e63e1f37 shasx r1, lr, r7 + 368: 363b9f55 shsaxcc r9, fp, r5 + 36c: a6313f7b shsub16ge r3, r1, fp + 370: c635df97 shadd8gt sp, r5, r7 + 374: e6351ff7 shsub8 r1, r5, r7 + 378: e654af17 uadd16 sl, r4, r7 + 37c: 26591f37 uasxcs r1, r9, r7 + 380: e65dbf57 usax fp, sp, r7 + 384: e654bf7e usub16 fp, r4, lr + 388: 365d2f97 uadd8cc r2, sp, r7 + 38c: c65a8ffe usub8gt r8, sl, lr + 390: e66c3f1d uqadd16 r3, ip, sp + 394: e66d4f36 uqasx r4, sp, r6 + 398: e66a1f5e uqsax r1, sl, lr + 39c: d66d2f7e uqsub16le r2, sp, lr + 3a0: e66c1f95 uqadd8 r1, ip, r5 + 3a4: c6640ffd uqsub8gt r0, r4, sp + 3a8: 867a0f15 uhadd16hi r0, sl, r5 + 3ac: d674bf37 uhasxle fp, r4, r7 + 3b0: a67e1f59 uhsaxge r1, lr, r9 + 3b4: e67b2f7e uhsub16 r2, fp, lr + 3b8: a6749f95 uhadd8ge r9, r4, r5 + 3bc: 867d2ffe uhsub8hi r2, sp, lr + 3c0: e683a877 sxtab16 sl, r3, r7, ror #16 + 3c4: 26a59c77 sxtabcs r9, r5, r7, ror #24 + 3c8: e6b53477 sxtah r3, r5, r7, ror #8 + 3cc: e6c48476 uxtab16 r8, r4, r6, ror #8 + 3d0: 06eb007d uxtabeq r0, fp, sp + 3d4: e6fc9075 uxtah r9, ip, r5 + 3d8: a68f387b sxtb16ge r3, fp, ror #16 + 3dc: 86af2076 sxtbhi r2, r6 + 3e0: c6bf3c7d sxthgt r3, sp, ror #24 + 3e4: e6cfc875 uxtb16 ip, r5, ror #16 + 3e8: e6efc875 uxtb ip, r5, ror #16 + 3ec: e6ff8875 uxth r8, r5, ror #16 + 3f0: 06bfaf34 reveq sl, r4 + 3f4: a6bf8fbc rev16ge r8, ip + 3f8: e6ffef37 rbit lr, r7 + 3fc: c6ffdfb7 revshgt sp, r7 + 400: e719fe1d sdiv r9, sp, lr + 404: e732f61c udiv r2, ip, r6 + 408: 47a20a51 sbfxmi r0, r1, #20, #3 + 40c: e7ee9852 ubfx r9, r2, #16, #15 + 410: 87dd1d9b bfihi r1, fp, #27, #3 + 414: e7d0339f bfc r3, #7, #10 + 418: e8060d6b stmda r6, {r0, r1, r3, r5, r6, r8, sl, fp} + 41c: e80438df stmda r4, {r0, r1, r2, r3, r4, r6, r7, fp, ip, sp} + 420: e810e1d4 ldmda r0, {r2, r4, r6, r7, r8, sp, lr, pc} + 424: e83c9873 ldmda ip!, {r0, r1, r4, r5, r6, fp, ip, pc} + 428: e8a931bd stmia r9!, {r0, r2, r3, r4, r5, r7, r8, ip, sp} + 42c: e88b55c3 stm fp, {r0, r1, r6, r7, r8, sl, ip, lr} + 430: e8bcbd22 ldm ip!, {r1, r5, r8, sl, fp, ip, sp, pc} + 434: e8bda10a pop {r1, r3, r8, sp, pc} + 438: e92b2219 stmdb fp!, {r0, r3, r4, r9, sp} + 43c: e9298dd5 stmdb r9!, {r0, r2, r4, r6, r7, r8, sl, fp, pc} + 440: e935605b ldmdb r5!, {r0, r1, r3, r4, r6, sp, lr} + 444: e91191a7 ldmdb r1, {r0, r1, r2, r5, r7, r8, ip, pc} + 448: e9ab6e6a stmib fp!, {r1, r3, r5, r6, r9, sl, fp, sp, lr} + 44c: e98b4507 stmib fp, {r0, r1, r2, r8, sl, lr} + 450: e9b053cc ldmib r0!, {r2, r3, r6, r7, r8, r9, ip, lr} + 454: e9912de7 ldmib r1, {r0, r1, r2, r5, r6, r7, r8, sl, fp, sp} + 458: 2a000075 bcs 634 + 45c: 4bfffffe 
blmi 45c + 460: 612fff10 bxvs r0 + 464: e12fff33 blx r3 + 468: eafffffe b 468 + 46c: ebfffffe bl 46c + 470: 612fff10 bxvs r0 + 474: e12fff35 blx r5 + 478: da00006d ble 634 + 47c: 4bfffffe blmi 47c + 480: 112fff19 bxne r9 + 484: e12fff3c blx ip + 488: eafffedc b 0 + 48c: 8bfffffe blhi 48c + 490: 712fff11 bxvc r1 + 494: c12fff37 blxgt r7 + 498: aafffed8 bge 0 + 49c: 8bfffffe blhi 49c + 4a0: e12fff1c bx ip + 4a4: 312fff37 blxcc r7 + 4a8: eafffffe b 4a8 + 4ac: cbfffed3 blgt 0 + 4b0: a12fff11 bxge r1 + 4b4: e12fff30 blx r0 + 4b8: eafffffe b 4b8 + 4bc: eb00005c bl 634 + 4c0: c12fff1e bxgt lr + 4c4: 112fff3b blxne fp + 4c8: 2afffffe bcs 4c8 + 4cc: ebfffffe bl 4cc + 4d0: 212fff1a bxcs sl + 4d4: e12fff34 blx r4 + 4d8: eafffec8 b 0 + 4dc: ebfffffe bl 4dc + 4e0: 312fff1c bxcc ip + 4e4: e12fff38 blx r8 + 4e8: ea000051 b 634 + 4ec: ebfffffe bl 4ec + 4f0: e12fff1a bx sl + 4f4: e12fff31 blx r1 + 4f8: 4e042a06 vmlami.f32 s4, s8, s12 + 4fc: ee052a45 vmls.f32 s4, s10, s10 + 500: ee151a46 vnmla.f32 s2, s10, s12 + 504: be134a04 vnmlslt.f32 s8, s6, s8 + 508: 4e263a47 vnmulmi.f32 s6, s12, s14 + 50c: ee310a00 vadd.f32 s0, s2, s0 + 510: ee321a45 vsub.f32 s2, s4, s10 + 514: 2e810a06 vdivcs.f32 s0, s2, s12 + 518: ee030b06 vmla.f64 d0, d3, d6 + 51c: ee010b45 vmls.f64 d0, d1, d5 + 520: ee141b46 vnmla.f64 d1, d4, d6 + 524: 1e110b01 vnmlsne.f64 d0, d1, d1 + 528: 1e253b45 vnmulne.f64 d3, d5, d5 + 52c: 3e320b04 vaddcc.f64 d0, d2, d4 + 530: ee321b44 vsub.f64 d1, d2, d4 + 534: 4e810b05 vdivmi.f64 d0, d1, d5 + 538: eeb03ac3 vabs.f32 s6, s6 + 53c: 5eb13a44 vnegpl.f32 s6, s8 + 540: eeb10ac4 vsqrt.f32 s0, s8 + 544: eeb00bc4 vabs.f64 d0, d4 + 548: eeb11b44 vneg.f64 d1, d4 + 54c: eeb10bc1 vsqrt.f64 d0, d1 + 550: 5e00ea10 vmovpl s0, lr + 554: ee14ba10 vmov fp, s8 + 558: bc4ebb11 vmovlt d1, fp, lr + 55c: ec557b15 vmov r7, r5, d5 + 560: eeb04a46 vmov.f32 s8, s12 + 564: 8eb01b42 vmovhi.f64 d1, d2 + 568: 6eb72a00 vmovvs.f32 s4, #112 ; 0x70 + 56c: eeb72b00 vmov.f64 d2, #112 ; 0x70 + 570: eeb03a00 vmov.f32 s6, #0 + 574: eeb01b00 vmov.f64 d1, #0 + 578: ed952a1d vldr s4, [r5, #116] ; 0x74 + 57c: 3d811a0e vstrcc s2, [r1, #56] ; 0x38 + 580: 1d957b04 vldrne d7, [r5, #16] + 584: ed816b39 vstr d6, [r1, #228] ; 0xe4 + +00000588 : + 588: ed9f1a0e vldr s2, [pc, #56] ; 5c8 + 58c: ed8f3a0d vstr s6, [pc, #52] ; 5c8 + 590: bd1f2b04 vldrlt d2, [pc, #-16] ; 588 + 594: cd0f3b02 vstrgt d3, [pc, #-8] ; 594 + 598: 3d9f2a0a vldrcc s4, [pc, #40] ; 5c8 + 59c: ed8f0a09 vstr s0, [pc, #36] ; 5c8 + 5a0: cd9f4b08 vldrgt d4, [pc, #32] ; 5c8 + 5a4: ed0f0b09 vstr d0, [pc, #-36] ; 588 + 5a8: ed9f4a06 vldr s8, [pc, #24] ; 5c8 + 5ac: ed8f3a05 vstr s6, [pc, #20] ; 5c8 + 5b0: 5d1f4b0c vldrpl d4, [pc, #-48] ; 588 + 5b4: ed0f5b0d vstr d5, [pc, #-52] ; 588 + 5b8: 9d9f4a02 vldrls s8, [pc, #8] ; 5c8 + 5bc: 3d0f6a02 vstrcc s12, [pc, #-8] ; 5bc + 5c0: ed9f6b00 vldr d6, [pc] ; 5c8 + 5c4: bd0f1b01 vstrlt d1, [pc, #-4] ; 5c8 + +000005c8 : + 5c8: ec912a01 vldmia r1, {s4} + 5cc: 2ca62a01 vstmiacs r6!, {s4} + 5d0: ecb91b08 vldmia r9!, {d1-d4} + 5d4: eca36b04 vstmia r3!, {d6-d7} + 5d8: 6d323a01 vldmdbvs r2!, {s6} + 5dc: ed267a01 vstmdb r6!, {s14} + 5e0: ed3d4b08 vldmdb sp!, {d4-d7} + 5e4: ed205b06 vstmdb r0!, {d5-d7} + 5e8: eeb41a41 vcmp.f32 s2, s2 + 5ec: 7eb44ac4 vcmpevc.f32 s8, s8 + 5f0: eeb40b46 vcmp.f64 d0, d6 + 5f4: aeb43bc7 vcmpege.f64 d3, d7 + 5f8: beb51a40 vcmplt.f32 s2, #0.0 + 5fc: ceb57ac0 vcmpegt.f32 s14, #0.0 + 600: eeb54b40 vcmp.f64 d4, #0.0 + 604: eeb51bc0 vcmpe.f64 d1, #0.0 + 608: 6ebd1ac3 vcvtvs.s32.f32 s2, s6 + 60c: cebc3ac7 vcvtgt.u32.f32 s6, s14 + 610: 3eb80ac1 
vcvtcc.f32.s32 s0, s2 + 614: 3eb81a42 vcvtcc.f32.u32 s2, s4 + 618: 8ebd2bc4 vcvthi.s32.f64 s4, d4 + 61c: 8ebc3bc6 vcvthi.u32.f64 s6, d6 + 620: 9eb73bc7 vcvtls.f32.f64 s6, d7 + 624: eeb83bc4 vcvt.f64.s32 d3, s8 + 628: 0eb85b47 vcvteq.f64.u32 d5, s14 + 62c: eeb74ac5 vcvt.f64.f32 d4, s10 + 630: e120017a bkpt 0x001a + */ + + static const unsigned int insns[] = + { + 0xe082852b, 0x009310c7, 0xe0290284, 0xc0329066, + 0xc04c000e, 0x00528364, 0xe069818d, 0x60708864, + 0xd08597a1, 0xe09d12c6, 0xc0adb0c7, 0xe0b80329, + 0xe0c392e6, 0x80dd1845, 0x30e28486, 0xe0f4a76d, + 0x118db785, 0xe19a9764, 0xe1cd90e5, 0xe1d20547, + 0xe086d777, 0x809c4776, 0x90265c57, 0xe035841d, + 0xe04c2055, 0x20539c17, 0xc06c9614, 0xe072811c, + 0xe08c4d1d, 0xe09b8d76, 0x10a20415, 0xe0beb256, + 0x80ca835e, 0xe0dc1615, 0x60e54a7e, 0xe0fc181d, + 0x61818076, 0xe19db577, 0xe1ce4216, 0xe1dba31d, + 0x828d8261, 0xe29ed69b, 0xe226e87d, 0xe2332f49, + 0xe24d46d9, 0xb25e1402, 0xe2650325, 0x3274882f, + 0xb2849102, 0xe2948902, 0x22aeac2a, 0xe2b6aabd, + 0xe2cc2426, 0xe2da85a5, 0xe2e6d871, 0x12fba6e9, + 0x638737ff, 0x03952951, 0x63c18eea, 0x33d2020a, + 0xe118028d, 0xe13601a7, 0xe15c0164, 0xb1750807, + 0xe112073e, 0x31300572, 0x915e0b37, 0x617a0b17, + 0xe3120585, 0x433e071b, 0xe355030e, 0x3377010a, + 0xe1a00b84, 0xe1b01484, 0xe1a001aa, 0xe1b00a2a, + 0xe1a015c9, 0x61b0254b, 0x31a08fe2, 0xe1b0946c, + 0xe1a0877e, 0xe1b0c473, 0xc1a0ce1d, 0xe1b0c61d, + 0xc1a00931, 0xc1b0bc33, 0xd1a0265c, 0xb1b0165a, + 0xe1a0a003, 0xe1b00009, 0x73a03e29, 0xe3b0497e, + 0xe1a0c1a6, 0x71b0554d, 0xe1a0137e, 0x01b0897c, + 0x330cbf31, 0x33429bf7, 0xd001059d, 0xe0100b9a, + 0xe0207c93, 0x0038639b, 0xe084e695, 0xe0940796, + 0xe0a08e9b, 0xe0b4b79e, 0x20c51796, 0x40db059c, + 0xe0498592, 0x0060ed94, 0x510d9054, 0x4125005c, + 0xe1473055, 0xe1649052, 0xe101658c, 0xe1006cca, + 0xe108e3a1, 0xe10176ed, 0xe1206483, 0xe12b7ec4, + 0xe14a0786, 0x914b3ec4, 0xe14b8ca3, 0xe14185e3, + 0x21220dac, 0xe12806ec, 0xa1620e86, 0xe16807cc, + 0x016a0ea3, 0xe1600de3, 0xe697a009, 0xe5d900c4, + 0xe1b4e0b6, 0xe1f96ed8, 0xe09120f1, 0xe6890004, + 0xe5e5305c, 0xe1c82ab0, 0xe79c8008, 0xe4dab010, + 0xe19ab0b6, 0xe1bb50da, 0xe19360f7, 0xe7ad7005, + 0xe5ca2000, 0xe08460b3, 0xe59ca000, 0xe4db4084, + 0xe09990bc, 0xe0d399d4, 0xe1f2b9f4, 0xe78db00b, + 0xe7cd100a, 0xe08ea0b9, 0xe7b36004, 0xe7f6400d, + 0xe09760ba, 0xe1b600db, 0xe096a0fd, 0xe783700c, + 0xe7e83001, 0xe1cc44b0, 0xe51f1008, 0xe55f7008, + 0xe15f21b0, 0xe15fa0d8, 0xe1dfe3f8, 0xe51f2008, + 0xe55f3008, 0xe1df72bc, 0xe15fd0d8, 0xe15fa2fc, + 0xe51f5008, 0xe5dfe01c, 0xe1df51b8, 0xe15f63dc, + 0xe15fb4f0, 0xe59f700c, 0xe5df5008, 0xe15fa4bc, + 0xe1df60d0, 0xe15f90f8, 0xe690036a, 0xe7d8348e, + 0xe78d5783, 0xe6c99145, 0xe7945360, 0xe7d0e4a4, + 0xe68c52cc, 0xe7e13667, 0xe7b26063, 0xe7fe8842, + 0xe7a363e6, 0xe7c83502, 0xe79db40e, 0xe7fda20c, + 0xe789d142, 0xe7eb774e, 0xe19bcf9f, 0xe1dc4f9f, + 0xe1fbbf9f, 0xe18e1f97, 0xe1c4cf96, 0x21e74f96, + 0xe6143f17, 0xe61a9f33, 0xe615cf56, 0xe615cf7e, + 0xe61a0f97, 0x66180ff2, 0x5624bf15, 0x6623bf3c, + 0xe6230f55, 0xe61caf75, 0x3626af9e, 0xe62baff7, + 0x56349f1e, 0xe63e1f37, 0x363b9f55, 0xa6313f7b, + 0xc635df97, 0xe6351ff7, 0xe654af17, 0x26591f37, + 0xe65dbf57, 0xe654bf7e, 0x365d2f97, 0xc65a8ffe, + 0xe66c3f1d, 0xe66d4f36, 0xe66a1f5e, 0xd66d2f7e, + 0xe66c1f95, 0xc6640ffd, 0x867a0f15, 0xd674bf37, + 0xa67e1f59, 0xe67b2f7e, 0xa6749f95, 0x867d2ffe, + 0xe683a877, 0x26a59c77, 0xe6b53477, 0xe6c48476, + 0x06eb007d, 0xe6fc9075, 0xa68f387b, 0x86af2076, + 0xc6bf3c7d, 0xe6cfc875, 0xe6efc875, 0xe6ff8875, + 0x06bfaf34, 0xa6bf8fbc, 0xe6ffef37, 
0xc6ffdfb7, + 0xe719fe1d, 0xe732f61c, 0x47a20a51, 0xe7ee9852, + 0x87dd1d9b, 0xe7d0339f, 0xe8060d6b, 0xe80438df, + 0xe810e1d4, 0xe83c9873, 0xe8a931bd, 0xe88b55c3, + 0xe8bcbd22, 0xe8bda10a, 0xe92b2219, 0xe9298dd5, + 0xe935605b, 0xe91191a7, 0xe9ab6e6a, 0xe98b4507, + 0xe9b053cc, 0xe9912de7, 0x2a000075, 0x4bfffffe, + 0x612fff10, 0xe12fff33, 0xeafffffe, 0xebfffffe, + 0x612fff10, 0xe12fff35, 0xda00006d, 0x4bfffffe, + 0x112fff19, 0xe12fff3c, 0xeafffedc, 0x8bfffffe, + 0x712fff11, 0xc12fff37, 0xaafffed8, 0x8bfffffe, + 0xe12fff1c, 0x312fff37, 0xeafffffe, 0xcbfffed3, + 0xa12fff11, 0xe12fff30, 0xeafffffe, 0xeb00005c, + 0xc12fff1e, 0x112fff3b, 0x2afffffe, 0xebfffffe, + 0x212fff1a, 0xe12fff34, 0xeafffec8, 0xebfffffe, + 0x312fff1c, 0xe12fff38, 0xea000051, 0xebfffffe, + 0xe12fff1a, 0xe12fff31, 0x4e042a06, 0xee052a45, + 0xee151a46, 0xbe134a04, 0x4e263a47, 0xee310a00, + 0xee321a45, 0x2e810a06, 0xee030b06, 0xee010b45, + 0xee141b46, 0x1e110b01, 0x1e253b45, 0x3e320b04, + 0xee321b44, 0x4e810b05, 0xeeb03ac3, 0x5eb13a44, + 0xeeb10ac4, 0xeeb00bc4, 0xeeb11b44, 0xeeb10bc1, + 0x5e00ea10, 0xee14ba10, 0xbc4ebb11, 0xec557b15, + 0xeeb04a46, 0x8eb01b42, 0x6eb72a00, 0xeeb72b00, + 0xeeb03a00, 0xeeb01b00, 0xed952a1d, 0x3d811a0e, + 0x1d957b04, 0xed816b39, 0xed9f1a0e, 0xed8f3a0d, + 0xbd1f2b04, 0xcd0f3b02, 0x3d9f2a0a, 0xed8f0a09, + 0xcd9f4b08, 0xed0f0b09, 0xed9f4a06, 0xed8f3a05, + 0x5d1f4b0c, 0xed0f5b0d, 0x9d9f4a02, 0x3d0f6a02, + 0xed9f6b00, 0xbd0f1b01, 0xec912a01, 0x2ca62a01, + 0xecb91b08, 0xeca36b04, 0x6d323a01, 0xed267a01, + 0xed3d4b08, 0xed205b06, 0xeeb41a41, 0x7eb44ac4, + 0xeeb40b46, 0xaeb43bc7, 0xbeb51a40, 0xceb57ac0, + 0xeeb54b40, 0xeeb51bc0, 0x6ebd1ac3, 0xcebc3ac7, + 0x3eb80ac1, 0x3eb81a42, 0x8ebd2bc4, 0x8ebc3bc6, + 0x9eb73bc7, 0xeeb83bc4, 0x0eb85b47, 0xeeb74ac5, + 0xe120017a, + }; +// END Generated code -- do not edit + + // reset the detected cpu feature set + VM_Version::features(detected_features); + + { + bool ok = true; + unsigned int *insns1 = (unsigned int *)entry; + for (unsigned int i = 0; i < sizeof insns / sizeof insns[0]; i++) { + if (insns[i] != insns1[i]) { + ok = false; + printf("Ours:\n"); + Disassembler::decode((address)&insns1[i], (address)&insns1[i+1]); + printf(" Raw: 0x%x\n", insns1[i]); + printf("Theirs:\n"); + Disassembler::decode((address)&insns[i], (address)&insns[i+1]); + printf(" Raw: 0x%x\n", insns[i]); + printf("\n"); + } + } + assert(ok, "Assembler smoke test failed"); + } +#endif // ASSERT +} + +#undef __ +void Address::AddressConstruct(Register base, RegisterOrConstant index, enum reg_op op, + shift_op shift, enum wb_mode mode) { + _base = base; + _wb_mode = mode; + _shift = shift; + _target = 0; + if (index.is_register()) { + _acc_mode = reg; + _index = index.as_register(); + _offset = 0; + _as_op = op; + } else { + assert(shift == lsl(), "should be"); + assert(index.is_constant(), "should be"); + _acc_mode = imm; + // _index = no_reg; + _offset = index.as_constant(); + if(SUB == _as_op) + _offset = -_offset; + } +} + +void Address::encode(Instruction_aarch32 *i, CodeSection *sec, address pc) const { + long offset = _offset; + access_mode mode = _acc_mode; + + if(lit == mode) { + //Create the offset from the address + offset = _target - pc; + mode = imm; + } + + //Correct the offset if the base is the PC + if(r15_pc == _base && imm == mode) { + offset -= 8; + } + + int U = (offset >= 0 && _acc_mode == imm) || (_as_op == ADD && _acc_mode == reg); + int P = pre == _wb_mode || off == _wb_mode; + int W = pre == _wb_mode; + i->f(P, 24), i->f(U, 23), i->f(W, 21), i->rf(_base, 16); + + offset = 
offset < 0 ? -offset : offset; + int opc = i->get(27, 25); + + if (imm == mode) { + switch(opc) { + case 0b010: + // LDR, LDRB + // STR, STRB + i->f(offset, 11, 0); + break; + case 0b000: + // LDRH, LDRSH, LDRSB, LDRD + // STRH, STRD + i->f(1, 22); + assert(offset < (1 << 8), "Offset larger than a byte"); + i->f(offset & 0xF, 3, 0); + i->f(offset >> 4, 11, 8); + break; + default: + ShouldNotReachHere(); + } + } else if (reg == mode) { + assert(r15_pc->encoding_nocheck() != + _base->encoding_nocheck(), "Remove this if you have your offsets right"); + switch(opc) { + case 0b010: + // LDR, LDRB + // STR, STRB + //Need to set bit 25 as Register 0b011 + i->f(1, 25); + i->f(_shift.shift(), 11, 7); + i->f(_shift.kind(), 6, 5); + i->f(0, 4); + i->rf(_index, 0); + break; + case 0b000: + // LDRH, LDRSH, LDRSB, LDRD + // STRH, STRD + //Need to clear bit 22 as Register + i->f(0, 22); + assert(_shift == lsl(), "Type of load/store does not support shift"); + i->f(0b0000, 11, 8); + i->rf(_index, 0); + break; + default: + ShouldNotReachHere(); + } + } else { + ShouldNotReachHere(); + } + + if(lit == _acc_mode) { + sec->relocate(pc, _rspec); + } +} + +void Address::fp_encode(Instruction_aarch32 *i, CodeSection *sec, address pc) const { + // ATM works only for immediate + assert(_wb_mode == off, "Can't do pre or post addressing for vldr, vstr"); + long offset = _offset; + if(imm == _acc_mode) { + if(r15_pc == _base) { + //Correct the offset if the base is the PC + offset -= 8; + } + bool U = offset >= 0; + assert(0 == (offset & 3), "Can only access aligned data"); + unsigned imm8 = uabs(offset) / 4; + i->f(U, 23), i->rf(_base, 16), i->f(imm8, 7, 0); + } else { + ShouldNotReachHere(); + } +} + +#define __ as-> +void Address::lea(MacroAssembler *as, Register r) const { + Relocation* reloc = _rspec.reloc(); + relocInfo::relocType rtype = (relocInfo::relocType) reloc->type(); + + //TODO Potentially remove this - added as aarch64 doesn't contain + // any method of handling pre or post + assert( _wb_mode != pre && _wb_mode != post, "Wrong wb mode"); + // could probably permit post however + switch(_acc_mode) { + case imm: { + if (_offset == 0 && _base == r) // it's a nop + break; + if (_offset > 0) + __ add(r, _base, _offset); + else + __ sub(r, _base, -_offset); + break; + } + case reg: { + switch (_as_op) { + case ADD: + __ add(r, _base, _index, _shift); + break; + case SUB: + __ sub(r, _base, _index, _shift); + break; + } + break; + } + case lit: { + if (rtype == relocInfo::none) + __ mov(r, target()); + else + __ movptr(r, (uint32_t)target()); + break; + } + default: + ShouldNotReachHere(); + } +} +#undef __ + +#define __ as-> +class Address; + +// Adapts given Address to the capabilities of instructions respective to the +// provided data type. E.g. some of the instructions cannot use index register +// while others cannot have an offset field. 
+// Returns a copy of this Address if it's good or constructs a new Address
+// good for respective instructions by emitting necessary code to calculate
+// the address in tmp register
+Address Address::safe_for(InsnDataType type, MacroAssembler *as, Register tmp) {
+  if (is_safe_for(type))
+    return *this;
+  assert(tmp->is_valid(), "must be");
+  lea(as, tmp);
+  return Address(tmp);
+}
+#undef __
+
+bool Address::is_safe_for(InsnDataType type) {
+  switch (_acc_mode) {
+    case imm:
+    case lit:
+      return offset_ok_for_immed(_offset, type);
+    case reg:
+      return shift_ok_for_index(_shift, type);
+    case no_mode:
+    default:
+      ShouldNotReachHere();
+      return false;
+  }
+}
+
+
+bool Address::offset_ok_for_immed(long offset, InsnDataType type) {
+  const int o = offset < 0 ? -offset : offset;
+  switch (type) {
+    case IDT_INT:
+    case IDT_BOOLEAN:
+    case IDT_OBJECT:
+    case IDT_ADDRESS:
+    case IDT_METADATA:
+    case IDT_ARRAY:
+      return o <= 0xfff;
+    case IDT_BYTE:
+    case IDT_SHORT:
+    case IDT_LONG:
+    case IDT_CHAR:
+      return o <= 0xff;
+    case IDT_FLOAT:
+    case IDT_DOUBLE:
+      return !(o & ~0x3fc);
+    case IDT_LEA:
+      return true;
+    case IDT_ATOMIC:
+    case IDT_MULTIWORD:
+      return !o;
+    default:
+      ShouldNotReachHere();
+      return false;
+  }
+}
+
+bool Address::shift_ok_for_index(shift_op shift, InsnDataType type) {
+  switch (type) {
+    case IDT_INT:
+    case IDT_BOOLEAN:
+    case IDT_OBJECT:
+    case IDT_ADDRESS:
+    case IDT_METADATA:
+    case IDT_ARRAY:
+      return !shift.is_register();
+    case IDT_BYTE:
+    case IDT_SHORT:
+    case IDT_LONG:
+    case IDT_CHAR:
+      return !shift.is_register() && shift.shift() == 0;
+    case IDT_LEA:
+      return true;
+    case IDT_FLOAT:
+    case IDT_DOUBLE:
+    case IDT_ATOMIC:
+    case IDT_MULTIWORD:
+      return false;
+    default:
+      ShouldNotReachHere();
+      return false;
+  }
+}
+
+void Assembler::emit_data64(jlong data,
+                            relocInfo::relocType rtype,
+                            int format) {
+  if (rtype == relocInfo::none) {
+    emit_int64(data);
+  } else {
+    emit_data64(data, Relocation::spec_simple(rtype), format);
+  }
+}
+
+void Assembler::emit_data64(jlong data,
+                            RelocationHolder const& rspec,
+                            int format) {
+
+  assert(inst_mark() != NULL, "must be inside InstructionMark");
+  // Do not use AbstractAssembler::relocate, which is not intended for
+  // embedded words. Instead, relocate to the enclosing instruction.
+  code_section()->relocate(inst_mark(), rspec, format);
+  emit_int64(data);
+}
+
+extern "C" {
+  void das(uint64_t start, int len) {
+    ResourceMark rm;
+    len <<= 2;
+    if (len < 0)
+      Disassembler::decode((address)start + len, (address)start);
+    else
+      Disassembler::decode((address)start, (address)start + len);
+  }
+
+  JNIEXPORT void das1(unsigned long insn) {
+    das(insn, 1);
+  }
+}
+
+#define starti Instruction_aarch32 do_not_use(this); set_current(&do_not_use)
+
+  void Assembler::adr(Register Rd, address adr, Condition cond) {
+    int offset = adr - pc() - 8;
+    adr_encode(Rd, offset, cond);
+  }
+
+#undef starti
+
+Address::Address(address target, relocInfo::relocType rtype)
+  : _acc_mode(lit), _base(sp), _offset(0), _wb_mode(off) {
+  //TODO we don't complete _wb_mode - what about Addresses that are pre/post accessed?
+  _is_lval = false;
+  _target = target;
+  switch (rtype) {
+  case relocInfo::oop_type:
+  case relocInfo::metadata_type:
+    // Oops are a special case. Normally they would be their own section
+    // but in cases like icBuffer they are literals in the code stream that
+    // we don't have a section for. We use none so that we get a literal address
+    // which is always patchable.
+ break; + case relocInfo::external_word_type: + _rspec = external_word_Relocation::spec(target); + break; + case relocInfo::internal_word_type: + _rspec = internal_word_Relocation::spec(target); + break; + case relocInfo::opt_virtual_call_type: + _rspec = opt_virtual_call_Relocation::spec(); + break; + case relocInfo::static_call_type: + _rspec = static_call_Relocation::spec(); + break; + case relocInfo::runtime_call_type: + _rspec = runtime_call_Relocation::spec(); + break; + case relocInfo::poll_type: + case relocInfo::poll_return_type: + _rspec = Relocation::spec_simple(rtype); + break; + case relocInfo::none: + _rspec = RelocationHolder::none; + break; + default: + ShouldNotReachHere(); + break; + } +} + +#ifdef COMPILER2 +Address Address::make_raw(int base, int index, int scale, unsigned long o, relocInfo::relocType disp_reloc) { + RelocationHolder rspec; + if (disp_reloc != relocInfo::none) { + rspec = Relocation::spec_simple(disp_reloc); + } + if (as_Register(index) == r15_pc) { + assert(scale == 0, "unsupported"); + Address a(as_Register(base), o); + a._rspec = rspec; + return a; + } else { + assert(o == 0, "unsupported"); + Address a(as_Register(base), as_Register(index), lsl(scale)); + a._rspec = rspec; + return a; + } +} +#endif + +void Assembler::adr(Register r, const Address &dest, Condition cond) { + code_section()->relocate(pc(), dest.rspec()); + adr(r, dest.target()); +} + +void Assembler::wrap_label(Label &L, Assembler::uncond_branch_insn insn) { + if (L.is_bound()) { + (this->*insn)(target(L)); + } else { + L.add_patch_at(code(), locator()); + (this->*insn)(pc()); + } +} +void Assembler::wrap_label(Label &L, Condition cond, + Assembler::cond_branch_insn insn) { + if (L.is_bound()) { + (this->*insn)(target(L), cond); + } else { + L.add_patch_at(code(), locator()); + (this->*insn)(pc(), cond); + } +} + +void Assembler::wrap_label(Register r, Label &L, Condition cond, + Assembler::cond_ldst_insn insn) { + if (L.is_bound()) { + (this->*insn)(r, target(L), cond); + } else { + L.add_patch_at(code(), locator()); + (this->*insn)(r, pc(), cond); + } +} + + +void Assembler::wrap_label(FloatRegister r, Label &L, Condition cond, + Assembler::cond_fp_ldst_insn insn) { + if (L.is_bound()) { + (this->*insn)(r, target(L), cond); + } else { + L.add_patch_at(code(), locator()); + (this->*insn)(r, pc(), cond); + } +} + + +uint32_t Assembler::encode_imm12(int imm) { + assert(is_valid_for_imm12(imm), + "only valid immediates allowed, call is_valid_for_imm12 first"); + uint32_t n = imm; + if ((n & 0xFFFFFF00) == 0) { + return n; + } + if ((n & 0xFC000000) == 0) { + const int lshift = __builtin_ctz(n) & 0xFFFFFFFE; + return ((32 - lshift) << 7) | (n >> lshift); + } + n = (n << 16) | (n >> 16); + const int lshift = __builtin_ctz(n) & 0xFFFFFFFE; + return ((16 - lshift) << 7) | (n >> lshift); +} + +int Assembler::decode_imm12(uint32_t imm12) { + assert((imm12 & 0xFFFFF000) == 0, "bad imm12"); + uint32_t shift = (imm12 & 0x00000F00) >> 7; + uint32_t value = imm12 & 0x000000FF; + return (int) ((value >> shift) | (value << (32 - shift))); +} + +bool Assembler::is_valid_for_imm12(int imm) { + uint32_t n = (uint32_t) imm; + uint32_t shift = __builtin_clz(n) & 0xFFFFFFFE; + uint32_t result = n << shift; + if ((result & 0x00FFFFFF) == 0) { + return true; + } + n = (n << 16) | (n >> 16); + shift = __builtin_clz(n) & 0xFFFFFFFE; + result = n << shift; + if ((result & 0x00FFFFFF) == 0) { + return true; + } + return false; +} + +bool Assembler::operand_valid_for_logical_immediate(bool is32, uint64_t imm) { 
+ return is32 && is_valid_for_imm12(imm); +} + +bool Assembler::operand_valid_for_add_sub_immediate(int imm) { + return is_valid_for_imm12(imm); +} + +bool Assembler::operand_valid_for_add_sub_immediate(unsigned long imm) { + return is_valid_for_imm12(imm); +} + +bool Assembler::operand_valid_for_add_sub_immediate(unsigned imm) { + return is_valid_for_imm12(imm); +} + +bool Assembler::operand_valid_for_add_sub_immediate(jlong imm) { + return is_valid_for_imm12(imm >> 32) && is_valid_for_imm12(imm); +} + +// n.b. this is implemented in subclass MacroAssembler +void Assembler::bang_stack_with_offset(int offset) { Unimplemented(); } + +int AbstractAssembler::code_fill_byte() { + return 0; +} + +void Assembler::mov_immediate(Register dst, uint32_t imm32, Condition cond, bool s) { +#ifndef PRODUCT + { + char buffer[64]; + snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); + block_comment(buffer); + } +#endif + if(is_valid_for_imm12(imm32)) { + if(s) movs_i(dst, (unsigned)imm32, cond); + else mov_i (dst, (unsigned)imm32, cond); + } else if(is_valid_for_imm12(~imm32)) { + if(s) mvns_i(dst, (unsigned)~imm32, cond); + else mvn_i (dst, (unsigned)~imm32, cond); + } else if (!s && VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2) && + (imm32 < (1 << 16))) { + movw_i(dst, (unsigned)imm32, cond); + } else if (!s && VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2) && + !(imm32 & ((1 << 16) - 1))) { + movw_i(dst, (unsigned)0, cond); + movt_i(dst, (unsigned)(imm32 >> 16), cond); + } else { // TODO Could expand to varied numbers of mov and orrs + //Need to do a full 32 bits + mov_immediate32(dst, imm32, cond, s); + } +} + +//This should really be in the macroassembler +void Assembler::mov_immediate32(Register dst, uint32_t imm32, Condition cond, bool s) +{ + // Need to move a full 32 bit immediate, for example if we're loading an address that + // might change later and therefore need to be updated. + if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { + //Use a movw and a movt + Assembler::movw_i(dst, (unsigned)(imm32 & 0xffff), cond); + Assembler::movt_i(dst, (unsigned)(imm32 >> 16), cond); + if(s) { + //Additionally emit a cmp instruction + Assembler::cmp(dst, 0); + } + } else { + // Sadly we don't have movw, movt + // instead emit a mov and three orr + mov_i(dst, imm32 & (0xff ), cond); + orr(dst, dst, imm32 & (0xff << 8 ), cond); + orr(dst, dst, imm32 & (0xff << 16), cond); + if(s) orrs(dst, dst, imm32 & (0xff << 24), cond); + else orr (dst, dst, imm32 & (0xff << 24), cond); + } +} + +#define starti Instruction_aarch32 do_not_use(this); set_current(&do_not_use) +void Assembler::add_sub_imm(int decode, Register Rd, Register Rn, int imm, + Condition cond, bool s) { + int cpart = 0; + switch(decode) { + case 0b0100: cpart = 0b0010; break; // ADD -> SUB + case 0b0010: // SUB -> ADD + case 0b0011: cpart = 0b0100; break; // RSB -> ADD + case 0b0101: cpart = 0b0110; break; // ADC -> SUBC + case 0b0110: // SUBC -> ADC + case 0b0111: cpart = 0b0101; break; // RSC -> ADC + default: ShouldNotReachHere(); + } + //try both possible imm_instrs + if(imm_instr(decode, Rd, Rn, imm, cond, s)) return; + if(imm_instr(cpart, Rd, Rn, -imm, cond, s)) return; + + //Try plan B - a mov first - need to have destination that is not an arg + assert(Rd != Rn, "Can't use imm and can't do a mov. I'm in a jam."); + mov_immediate(Rd, (uint32_t)uabs(imm), cond, s); + //Now do the non immediate version - copied from the immediate encodings + { + starti; + reg_instr( imm < 0 ? 
cpart : decode, lsl(), cond, s); + rf(Rn, 16), rf(Rd, 12), rf(Rd, 0); + } +} + +bool Assembler::can_ldst_multiple( unsigned regset, const Address& adr) { + int nbits = count_bits(regset); + return adr.get_mode() == Address::imm && + !(adr.base()->bit() & regset) && // FIXME, this could be relaxed + (((adr.offset() == 0 || adr.offset() == wordSize || adr.offset() == -nbits * wordSize) && + (adr.get_wb_mode() == Address::pre || adr.get_wb_mode() == Address::off)) || + ((adr.offset() == 0 || adr.offset() == -wordSize || adr.offset() == nbits * wordSize) && + adr.get_wb_mode() == Address::post)); +} + +unsigned Assembler::count_bits(unsigned val) { + unsigned i, count; + for(i = 0, count = 0; i < 8 * sizeof(val); val >>= 1, i++) + if( val & 1 ) count++; + return count; +} + + +void Assembler::vmov_imm(FloatRegister Rd, unsigned imm, bool is64bit, + Condition cond) { + starti; + fp_instr_base(is64bit, cond); + f(0b1011, 23, 20); + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rd, false, 12, 22); + f(0b0000, 7, 4); + f(imm & 0xf, 3, 0); + f(imm >> 4, 19, 16); +} + +void Assembler::vmov_imm(FloatRegister Rd, unsigned imm) { + assert(operand_valid_for_double_immediate(0), "operand should be valid for immediate"); + int cmod = 0b0000; + { + starti; + f(0b1111001, 31, 25); + f(0, 24); // imm1 + f(0b10000, 23, 19); + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rd, false, 12, 22); + f(cmod, 11, 8); + f(0b000, 7, 5); + f(1, 4); + f(imm & 0xf, 3, 0); //imm4 + f(imm >> 4, 18, 16); //imm3 + } +} + +void Assembler::vmov_imm_zero(FloatRegister Rd, bool is64bit, + Condition cond) { + // Note that this is not a floating point vmov but instead + // an integer vmov from the SIMD instructions. + // cannot be conditional. + assert(operand_valid_for_double_immediate(0), "operand should be valid for immediate"); + assert(is64bit, "SIMD loading available only for double registers"); + assert(cond == C_DFLT, "Unable to vmov #0 conditionally"); + //int cmod = is64bit? 0b1110 : 0b0000; // ? 
I64 : I32 + int cmod = 0b1110; + { + starti; + f(0b1111001, 31, 25); + f(0, 24); // imm1 + f(0b10000, 23, 19); + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rd, false, 12, 22); + f(0b000, 18, 16); //imm3 + f(cmod, 11, 8); + f(0b00, 7, 6); + f(is64bit, 5); + f(1, 4); + f(0b0000, 3, 0); //imm4 + } +} + +bool Assembler::operand_valid_for_float_immediate(float v) { + if (!(VM_Version::features() & FT_VFPV3)) { + return false; + } + union ufloat { + float f; + uint32_t u; + } imm; + unsigned tmp; + imm.f = v; + + if (imm.u & ((1 << 19) - 1)) + return false; + + tmp = (imm.u >> 25) & ((1 << 6) - 1); + return tmp == 32 || tmp == 31; +} + +bool Assembler::operand_valid_for_double_immediate(double v) { + if (!(VM_Version::features() & FT_VFPV3)) { + return false; + } + union ufloat { + double f; + uint64_t u; + } imm; + unsigned tmp; + imm.f = v; + + if ((VM_Version::features() & FT_AdvSIMD) && imm.u == 0) + return true; + + if (imm.u & (uint64_t) 0xffffffffffffLL) + return false; + + imm.u >>= 48; + + tmp = (imm.u >> 6) & ((1 << 9) - 1); + return tmp == 0x100 || tmp == 0xff; +} + +unsigned Assembler::encode_float_fp_imm(float imm_f) { + assert(operand_valid_for_float_immediate(imm_f), "operand should be valid for immediate"); + union ufloat { + float f; + uint32_t u; + } imm; + unsigned tmp, imm8; + imm.f = imm_f; + + assert(!(imm.u & ((1 << 19) - 1)), "Invalid float imm"); + tmp = (imm.u >> 25) & ((1 << 6) - 1); + assert(tmp == 32 || tmp == 31, "Invalid float imm"); + + imm8 = (imm.u >> 24) & 0x80; // set a + imm8 |= (imm.u >> 19) & 0x7F; // set bcdefgh + return imm8; +} + +unsigned Assembler::encode_double_fp_imm(double imm_f) { + assert(operand_valid_for_double_immediate(imm_f), "operand should be valid for immediate"); + union ufloat { + double f; + uint64_t u; + } imm; + unsigned tmp, imm8; + imm.f = imm_f; + + assert(!(imm.u & (uint64_t)0xffffffffffffLL), "Invalid float imm"); + imm.u >>= 48; + + tmp = (imm.u >> 6) & ((1 << 9) - 1); + assert(tmp == 0x100 || tmp == 0xff, "Invalid float imm"); + + imm8 = (imm.u >> 8) & 0x80; // set a + imm8 |= imm.u & 0x7F; // set bcdefgh + return imm8; +} + + +void Assembler::fp_ldst_instr(int decode, bool is64bit, const Address& adr, + Condition cond) { + f(cond, 31, 28), f(0b110, 27, 25), f(decode, 24, 20); + f(0b101, 11, 9), f(is64bit, 8); + adr.fp_encode(current, code_section(), pc()); +} + +void Assembler::fp_ldst_mul(Register Rn, uint32_t regset, bool load, bool is64bit, + enum fp_mode mode, Condition cond) { + starti; + bool P = db_wb == mode; + bool U = ia_wb == mode || ia == mode; + bool W = ia_wb == mode || db_wb == mode; + // Encode registers + unsigned i, fp_first_reg, nregs = 1; + bool enc_z = false; + for(fp_first_reg = 0; !(regset & 1); regset >>= 1, fp_first_reg++); + FloatRegister Rd = (FloatRegister) fp_first_reg; + for(i = 0; i + fp_first_reg < 8 * sizeof(int); i++) { + regset >>= 1; + if(regset & 1) { + assert(!enc_z, "Unable to encode non-consecutive registers in fp_ldst_mul"); + nregs++; + } else { + enc_z = true; + } + } + assert(!is64bit || nregs <= 16, "Too many registers in a set"); + f(cond, 31, 28), f(0b110, 27, 25); f(P, 24), f(U, 23), f(W, 21), f(load, 20); + // vstm/vstm uses double register number, not it's encoding. Should reencode it. + rf(Rn, 16), fp_rencode(Rd, is64bit, 12, 22), f(0b101, 11, 9), f(is64bit, 8); + f(is64bit ? 
nregs * 2 : nregs, 7, 0); +} + +void Assembler::simd_ldst(FloatRegister Rd, unsigned type, unsigned size, unsigned num_regs, + const Address &addr, enum SIMD_Align align, unsigned encode) { + starti; + assert(addr.get_mode() == Address::imm && + (addr.get_wb_mode() == Address::off && addr.offset() == 0) || + (addr.get_wb_mode() == Address::post && addr.offset() == long(8*num_regs)), "Unsupported"); + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + f(0b1111, 31, 28), f(0b0100, 27, 24), f(0, 23), f(encode, 21, 20); + rf(addr.base(), 16), fp_rencode(Rd, false, 12, 22), f(type, 11, 8), f(size, 7, 6); + f((unsigned)align, 5, 4), f(addr.get_wb_mode() == Address::post ? 0b1101 : 0b1111, 3, 0); +} + +void Assembler::simd_ldst_single(FloatRegister Rd, unsigned size, unsigned index, + const Address &addr, bool align, unsigned encode) { + starti; + assert(addr.get_mode() == Address::imm && + (addr.get_wb_mode() == Address::off && addr.offset() == 0) || + (addr.get_wb_mode() == Address::post && addr.offset() == long(1<>2)&3, 22, 21), f(bit20, 20); + fp_rencode(Dd, false, 16, 7), f(opc>>4, 23); + rf(Rt, 12), f(0b1011, 11, 8), f(opc & 3, 6, 5), f(0b10000, 4, 0); +} + +void Assembler::simd_logicalop(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, unsigned q, + unsigned a, unsigned b, unsigned u, unsigned c) { + starti; + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + f(0b1111001, 31, 25), f(u, 24), f(0, 23), f(c, 21, 20), fp_rencode(Dd, false, 12, 22); + fp_rencode(Dn, false, 16, 7), f(a, 11, 8), fp_rencode(Dm, false, 0, 5), f(q, 6), f(b, 4); +} + +void Assembler::simd_vmul(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, + unsigned bit24, unsigned bits109, unsigned size, unsigned mul, unsigned bit6) { + starti; + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + f(0b1111001, 31, 25), f(bit24, 24), f(size, 21, 20), fp_rencode(Dd, false, 12, 22); + f(mul^1, 23), fp_rencode(Dn, false, 16, 7), f(1, 11), f(bits109, 10, 9); + f(mul, 8), f(bit6, 6), f(mul, 4), fp_rencode(Dm, false, 0, 5); +} + +void Assembler::simd_vuzp(FloatRegister Dd, FloatRegister Dm, unsigned size, unsigned q) { + starti; + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + f(0b111100111, 31, 23), fp_rencode(Dd, false, 12, 22), f(0b11, 21, 20), f(size, 19, 18); + f(0b10, 17, 16), f(0b00010, 11, 7), f(q, 6), f(0, 4), fp_rencode(Dm, false, 0, 5); +} + +void Assembler::simd_vshl(FloatRegister Dd, FloatRegister Dm, unsigned imm, + unsigned q, unsigned u, unsigned encode) { + starti; + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + f(0b1111001, 31, 25), f(u, 24), f(1, 23), fp_rencode(Dd, false, 12, 22); + f(imm & 0b111111, 21, 16), f(imm >> 6, 7), f(q, 6); + f(encode, 11, 8), fp_rencode(Dm, false, 0, 5), f(1, 4); +} + +void Assembler::simd_vshl(FloatRegister Dd, FloatRegister Dm, FloatRegister Dn, unsigned size, + unsigned q, unsigned u) { + starti; + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + f(0b1111001, 31, 25), f(u, 24), f(0b0, 23), f(size, 21, 20), fp_rencode(Dn, false, 16, 7); + fp_rencode(Dd, false, 
12, 22), f(0b0100, 11, 8), f(q, 6), fp_rencode(Dm, false, 0, 5), f(0, 4); +} + +// Two registers miscellaneous +void Assembler::simd_insn(FloatRegister Dd, FloatRegister Dm, unsigned q, unsigned a, + unsigned b, unsigned size) { + starti; + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + f(0b111100111, 31, 23), fp_rencode(Dd, false, 12, 22), f(0b11, 21, 20); + f(size, 19, 18), f(a, 17, 16), f(0b0, 11), f(b, 10, 6); + fp_rencode(Dm, false, 0, 5), f(0, 4); +} + +void Assembler::simd_cnt(FloatRegister Dd, FloatRegister Dm, unsigned q) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + starti; + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + f(0b111100111, 31, 23), fp_rencode(Dd, false, 12, 22), f(0b110000, 21, 16); + f(0b01010, 11, 7), f(q, 6), fp_rencode(Dm, false, 0, 5), f(0b0, 4); +} + +void Assembler::simd_padl(FloatRegister Dd, FloatRegister Dm, unsigned q, unsigned size, + unsigned op, unsigned encode) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + starti; + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + assert(size != 0b11, "unsupported"); + f(0b111100111, 31, 23), f(0b11, 21, 20), f(0b00, 17, 16), f(0b0, 11); + fp_rencode(Dd, false, 12, 22), f(0b0, 4), fp_rencode(Dm, false, 0, 5); + f(size, 19, 18), f(op, 7), f(q, 6), f(encode, 10, 8); +} + +void Assembler::simd_dup(FloatRegister Dd, Register Rt, unsigned q, unsigned size) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(size != 0b11, "must be"); + assert(!q || (Dd->encoding() & 2) == 0, "Odd register"); + starti; + f(0b111011101, 31, 23), f(size >> 1, 22), f(q, 21), f(0, 20), fp_rencode(Dd, false, 16, 7); + rf(Rt, 12), f(0b1011, 11, 8), f(size & 1, 6, 5), f(0b10000, 4, 0); +} + +void Assembler::simd_dup(FloatRegister Dd, FloatRegister Dm, unsigned index, unsigned q, unsigned size) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(size != 0b11, "must be"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + starti; + f(0b111100111, 31, 23), fp_rencode(Dd, false, 12, 22), f(0b11, 21, 20); + f(((index<<1)|1)<<(2-size), 19, 16), f(0b11000, 11, 7), f(q, 6), fp_rencode(Dm, false, 0, 5), f(0b0, 4); +} + +void Assembler::simd_neg(FloatRegister Dd, FloatRegister Dm, unsigned q, unsigned size) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(size != 0b11, "must be"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd registers"); + starti; + f(0b111100111, 31, 23), fp_rencode(Dd, false, 12, 22), f(0b11, 21, 20), f(size, 19, 18); + f(0b01, 17, 16), f(0b00111, 11, 7), f(q, 6), fp_rencode(Dm, false, 0, 5), f(0b0, 4); +} + +void Assembler::simd_vmov(FloatRegister Dd, unsigned imm, unsigned q, unsigned op_cmode) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || (Dd->encoding() & 2) == 0, "Odd register"); + assert(!(imm >> 8), "must be imm8"); + starti; + f(0b1111001, 31, 25), f(imm>>7, 24), f(0b1, 23), fp_rencode(Dd, false, 12, 22); + f(0b000, 21, 19), f((imm>>4)&0x7, 18, 16), f(op_cmode&0xf, 11, 8), f(0b0, 7); + f(q, 6); f(op_cmode>>4, 5), f(0b1, 4), f(imm&0xf, 3, 0); +} + +void Assembler::simd_insn(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, + unsigned q, unsigned a, unsigned b, unsigned u, unsigned c) { + 
assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || (Dd->encoding() & 2) == 0, "Odd register"); + starti; + f(0b1111001, 31, 25), f(u, 24), f(0b0, 23), f(c, 21, 20), f(a, 11, 8), f(b, 4), f(q, 6); + fp_rencode(Dn, false, 16, 7), fp_rencode(Dd, false, 12, 22), fp_rencode(Dm, false, 0, 5); +} + +void Assembler::simd_mvn(FloatRegister Dd, FloatRegister Dm, unsigned q) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + starti; + f(0b111100111, 31, 23), fp_rencode(Dd, false, 12, 22), f(0b11, 21, 20), f(0b00, 19, 18); + f(0b00, 17, 16), f(0b01011, 11, 7), f(q, 6), fp_rencode(Dm, false, 0, 5), f(0b0, 4); +} + +void Assembler::simd_insn(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, + unsigned qn, unsigned a, unsigned b, unsigned u) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert((Dd->encoding() & 2) == 0, "Odd register"); + assert(!qn || (Dn->encoding() & 2) == 0, "Odd operand register"); + starti; + f(0b1111001, 31, 25), f(u, 24), f(0b1, 23), f(b, 21, 20), f(a, 11, 8), f(0b0, 4), f(0b0, 6); + fp_rencode(Dn, false, 16, 7), fp_rencode(Dd, false, 12, 22), fp_rencode(Dm, false, 0, 5); +} + +void Assembler::simd_vext(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, unsigned q, unsigned imm) { + assert(VM_Version::features() & FT_AdvSIMD, "SIMD coprocessor required"); + assert(!q || ((Dd->encoding() & 2) == 0 && (Dn->encoding() & 2) == 0 && (Dm->encoding() & 2) == 0), "Odd register"); + starti; + f(0b111100101, 31, 23), f(0b11, 21, 20), f(imm, 11, 8), f(q, 6), f(0b0, 4); + fp_rencode(Dn, false, 16, 7), fp_rencode(Dd, false, 12, 22), fp_rencode(Dm, false, 0, 5); +} + +#undef starti --- /dev/null 2018-09-25 19:24:11.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/assembler_aarch32.hpp 2018-09-25 19:24:11.000000000 +0300 @@ -0,0 +1,2612 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_ASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_VM_ASSEMBLER_AARCH32_HPP + +#include "asm/register.hpp" +#include "vm_version_aarch32.hpp" + +// Definitions of various symbolic names for machine registers + +// Here we define how many integer and double precision floating point +// registers are used for passing parameters by the C and Java calling +// conventions. 
Each double precision floating point register can be used
+// as two single precision registers.
+
+class Argument {
+ public:
+  enum {
+    n_int_register_parameters_c   = 4,  // c_rarg0, c_rarg1, c_rarg2, c_rarg3
+#ifdef HARD_FLOAT_CC
+    n_float_register_parameters_c = 8,  // c_farg0, c_farg1, ..., c_farg7
+#else // HARD_FLOAT_CC
+    n_float_register_parameters_c = 0,  // 0 registers used to pass arguments
+#endif // HARD_FLOAT_CC
+    n_int_register_parameters_j   = 4,  // j_rarg0, j_rarg1, j_rarg2, j_rarg3
+#ifdef HARD_FLOAT_CC
+    n_float_register_parameters_j = 8   // j_farg0, j_farg1, ..., j_farg7
+#else // HARD_FLOAT_CC
+    n_float_register_parameters_j = 0   // 0 registers used to pass arguments
+#endif // HARD_FLOAT_CC
+  };
+};
+
+// Symbolic names for the register arguments used by the C calling convention
+// (the calling convention for C runtime calls and calls to JNI native
+// methods)
+
+REGISTER_DECLARATION(Register, c_rarg0, r0);
+REGISTER_DECLARATION(Register, c_rarg1, r1);
+REGISTER_DECLARATION(Register, c_rarg2, r2);
+REGISTER_DECLARATION(Register, c_rarg3, r3);
+
+// Symbolic names for the register arguments used by the Java calling
+// convention (the calling convention for calls to compiled Java methods)
+
+// In contrast to most ports we don't shift the Java argument registers by 1.
+// Although shifting would help to avoid an extra argument copy when invoking
+// JNI methods, it would bring a lot more complexity into the C2 port and
+// prevent the use of ldrd/strd instructions when dealing with jlong values.
+//
+// |-----------------------------------|
+// | c_rarg0 c_rarg1 c_rarg2 c_rarg3   |
+// |-----------------------------------|
+// | r0      r1      r2      r3        |
+// |-----------------------------------|
+// | j_rarg0 j_rarg1 j_rarg2 j_rarg3   |
+// |-----------------------------------|
+
+REGISTER_DECLARATION(Register, j_rarg0, c_rarg0);
+REGISTER_DECLARATION(Register, j_rarg1, c_rarg1);
+REGISTER_DECLARATION(Register, j_rarg2, c_rarg2);
+REGISTER_DECLARATION(Register, j_rarg3, c_rarg3);
+
+// Common register aliases used in assembler code
+
+// These registers are used to hold VM data either temporarily within a method
+// or across method calls. According to AAPCS, r0-r3 and r12 are caller-saved,
+// the rest are callee-saved.
+
+// These 4 aliases are used in the template interpreter only.
+
+REGISTER_DECLARATION(Register, rdispatch, r4);  // Address of dispatch table
+REGISTER_DECLARATION(Register, rbcp,      r5);  // Bytecode pointer
+REGISTER_DECLARATION(Register, rlocals,   r6);  // Address of local variables section of current frame
+REGISTER_DECLARATION(Register, rcpool,    r7);  // Address of constant pool cache
+
+// The following aliases are used in all VM components.
+ +REGISTER_DECLARATION(Register, rmethod, r8); // Address of current method +REGISTER_DECLARATION(Register, rscratch1, r9); // Scratch register +REGISTER_DECLARATION(Register, rthread, r10); // Address of current thread +REGISTER_DECLARATION(Register, rfp, r11); // Frame pointer +REGISTER_DECLARATION(Register, rscratch2, r12); // Scratch register +REGISTER_DECLARATION(Register, sp, r13); // Stack pointer +REGISTER_DECLARATION(Register, lr, r14); // Link register +REGISTER_DECLARATION(Register, r15_pc, r15); // Program counter + + +extern "C" void entry(CodeBuffer *cb); + + +#define assert_cond(ARG1) assert(ARG1, #ARG1) + +class Assembler; + +class Instruction_aarch32 { + unsigned insn; +#ifdef ASSERT + unsigned bits; +#endif + Assembler *assem; + +public: + + Instruction_aarch32(class Assembler *as) { +#ifdef ASSERT + bits = 0; +#endif + insn = 0; + assem = as; + } + + inline ~Instruction_aarch32(); + + unsigned &get_insn() { return insn; } +#ifdef ASSERT + unsigned &get_bits() { return bits; } +#endif + + static inline int32_t extend(unsigned val, int hi = 31, int lo = 0) { + union { + unsigned u; + int n; + }; + + u = val << (31 - hi); + n = n >> (31 - hi + lo); + return n; + } + + static inline uint32_t extract(uint32_t val, int msb, int lsb) { + int nbits = msb - lsb + 1; + assert_cond(msb >= lsb); + uint32_t mask = (1U << nbits) - 1; + uint32_t result = val >> lsb; + result &= mask; + return result; + } + + static inline int32_t sextract(uint32_t val, int msb, int lsb) { + uint32_t uval = extract(val, msb, lsb); + return extend(uval, msb - lsb); + } + + static void patch(address a, int msb, int lsb, unsigned long val) { + int nbits = msb - lsb + 1; + guarantee(val < (1U << nbits), "Field too big for insn"); + assert_cond(msb >= lsb); + unsigned mask = (1U << nbits) - 1; + val <<= lsb; + mask <<= lsb; + unsigned target = *(unsigned *)a; + target &= ~mask; + target |= val; + *(unsigned *)a = target; + } + + static void spatch(address a, int msb, int lsb, long val) { + int nbits = msb - lsb + 1; + long chk = val >> (nbits - 1); + guarantee (chk == -1 || chk == 0, "Field too big for insn"); + unsigned uval = val; + unsigned mask = (1U << nbits) - 1; + uval &= mask; + uval <<= lsb; + mask <<= lsb; + unsigned target = *(unsigned *)a; + target &= ~mask; + target |= uval; + *(unsigned *)a = target; + } + +/* void f(unsigned val, int msb, int lsb) { + int nbits = msb - lsb + 1; + guarantee(val < (1U << nbits), "Field too big for insn"); + assert_cond(msb >= lsb); + unsigned mask = (1U << nbits) - 1; + val <<= lsb; + mask <<= lsb; + insn |= val; + assert_cond((bits & mask) == 0); +#ifdef ASSERT + bits |= mask; +#endif + }*/ + + void f(unsigned val, int msb, int lsb) { + int nbits = msb - lsb + 1; + guarantee(val < (1U << nbits), "Field too big for insn"); + assert_cond(msb >= lsb); + unsigned mask = (1U << nbits) - 1; + val <<= lsb; + mask <<= lsb; + insn &= ~mask; + insn |= val; +#ifdef ASSERT + bits |= mask; +#endif + } + + void f(unsigned val, int bit) { + f(val, bit, bit); + } + + void sf(long val, int msb, int lsb) { + int nbits = msb - lsb + 1; + long chk = val >> (nbits - 1); + guarantee (chk == -1 || chk == 0, "Field too big for insn"); + unsigned uval = val; + unsigned mask = (1U << nbits) - 1; + uval &= mask; + f(uval, lsb + nbits - 1, lsb); + } + + void rf(Register r, int lsb) { + f(r->encoding_nocheck(), lsb + 3, lsb); + } + + void rf(FloatRegister r, int lsb) { + f(r->encoding_nocheck(), lsb + 4, lsb); + } + + unsigned get(int msb = 31, int lsb = 0) { + int nbits = msb - lsb + 1; 
+ unsigned mask = ((1U << nbits) - 1) << lsb; + assert_cond((bits & mask) == mask); + return (insn & mask) >> lsb; + } + + void fixed(unsigned value, unsigned mask) { + assert_cond ((mask & bits) == 0); +#ifdef ASSERT + bits |= mask; +#endif + insn |= value; + } +}; + +#define starti Instruction_aarch32 do_not_use(this); set_current(&do_not_use) + +// abs methods which cannot overflow and so are well-defined across +// the entire domain of integer types. +static inline unsigned int uabs(unsigned int n) { + union { + unsigned int result; + int value; + }; + result = n; + if (value < 0) result = -result; + return result; +} +static inline unsigned long uabs(unsigned long n) { + union { + unsigned long result; + long value; + }; + result = n; + if (value < 0) result = -result; + return result; +} +static inline unsigned long uabs(long n) { return uabs((unsigned long)n); } +static inline unsigned long uabs(int n) { return uabs((unsigned int)n); } + +#define S_DFLT ::lsl() +#define C_DFLT AL + + +// Shift for base reg + reg offset addressing +class shift_op { + public: + enum shift_kind { LSL, LSR, ASR, ROR }; + private: + enum shift_source { imm_s, reg_s }; + enum shift_source _source; + enum shift_kind _op; + int _shift; + Register _reg; + + bool check_valid() { + if(imm_s == _source) { + switch(_op) { + case LSL: return _shift >= 0 && _shift <= 31; + case ROR: return _shift >= 1 && _shift <= 32; + default: return _shift >= 1 && _shift <= 32; + } + } + return true; //Don't check register shifts + } + public: + // Default shift is lsl(0) + shift_op() + : _source(imm_s), _op(LSL), _shift(0) { } + shift_op(enum shift_kind op, int shift) + : _source(imm_s), _op(op), _shift(shift) { + if(!shift) { + // All zero shift encodings map to LSL 0 + _shift = 0; + _op = LSL; + } + int pshift = _shift; + if(-1 == _shift && ROR == _op) { + // This is an RRX, make shift valid for the check + _shift = 1; + pshift = 0; //set to zero + } + assert(check_valid(), "Invalid shift quantity"); + _shift = pshift; //restore shift + } + shift_op(enum shift_kind op, Register r) + : _source(reg_s), _op(op), _reg(r) {} + + shift_kind kind() const { + return _op; + } + + int shift() const { + assert(imm_s == _source, "Not an immediate shift"); + return _shift % 32; + } + Register reg() const { + assert(reg_s == _source, "Not a register shift"); + return _reg; + } + bool is_register() { + return reg_s == _source; + } + bool operator==(const shift_op& other) const { + if(imm_s == _source && imm_s == other._source) { + return _op == other._op && _shift == other._shift; + } else if (reg_s == _source && imm_s == _source) { + return _op == other._op && _reg == other._reg; + } + return false; + } + bool operator!=(const shift_op& other) const { + return !( *this == other); + } +}; +class lsl : public shift_op { + public: + lsl(int sft = 0): shift_op(LSL, sft) { } + lsl(Register r): shift_op(LSL, r) { } +}; +class lsr : public shift_op { + public: + lsr(int sft = 0): shift_op(LSR, sft) { } + lsr(Register r): shift_op(LSR, r) { } +}; +class asr : public shift_op { + public: + asr(int sft = 0): shift_op(ASR, sft) { } + asr(Register r): shift_op(ASR, r) { } +}; +class ror : public shift_op { + public: + ror(int sft = 0): shift_op(ROR, sft) {} + ror(Register r): shift_op(ROR, r) { } +}; +class rrx : public shift_op { + public: + rrx(): shift_op(ROR, -1) {} +}; + + +// Addressing modes +class Address { + public: + enum access_mode { no_mode, imm, reg, lit }; + //literal is class of imm? 
-> potentially have to split later if some instructions work + // with one but not other although can be determined from registers. + enum wb_mode { off, pre, post }; + + enum reg_op { ADD, SUB }; + + private: + Register _base; + Register _index; + int _offset; + enum access_mode _acc_mode; + enum wb_mode _wb_mode; + enum reg_op _as_op; + shift_op _shift; + + RelocationHolder _rspec; + + // Typically we use AddressLiterals we want to use their rval + // However in some situations we want the lval (effect address) of + // the item. We provide a special factory for making those lvals. + bool _is_lval; + + // If the target is far we'll need to load the ea of this to a + // register to reach it. Otherwise if near we can do PC-relative + // addressing. + address _target; + + public: + Address() + : _acc_mode(no_mode) { } + //immediate & literal + Address(Register r, enum wb_mode mode = off) + : _base(r), _index(noreg), _offset(0), _acc_mode(imm), _wb_mode(mode), + _shift(lsl()), _target(0) { + assert(!(r == r15_pc && _wb_mode == pre), "The PC can't be pre-indexed."); + } + Address(Register r, int o, enum wb_mode mode = off) + : _base(r), _index(noreg), _offset(o), _acc_mode(imm), _wb_mode(mode), + _shift(lsl()), _target(0) { + assert(!(r == r15_pc && _wb_mode == pre), "The PC can't be pre-indexed."); + } + Address(Register r, long o, enum wb_mode mode = off) + : _base(r), _index(noreg), _offset(o), _acc_mode(imm), _wb_mode(mode), + _shift(lsl()), _target(0) { + assert(!(r == r15_pc && _wb_mode == pre), "The PC can't be pre-indexed."); + } + Address(Register r, unsigned long o, enum wb_mode mode = off) + : _base(r), _index(noreg), _offset(o), _acc_mode(imm), _wb_mode(mode), + _shift(lsl()), _target(0) { + assert(!(r == r15_pc && _wb_mode == pre), "The PC can't be pre-indexed."); + } + Address(Register r, unsigned int o, enum wb_mode mode = off) + : _base(r), _index(noreg), _offset(o), _acc_mode(imm), _wb_mode(mode), + _shift(lsl()), _target(0) { + assert(!(r == r15_pc && _wb_mode == pre), "The PC can't be pre-indexed."); + } +#ifdef ASSERT + Address(Register r, ByteSize disp) + : _base(r), _index(noreg), _offset(in_bytes(disp)), _acc_mode(imm), _wb_mode(off), + _shift(lsl()), _target(0) { + assert(!(r == r15_pc && _wb_mode == pre), "The PC can't be pre-indexed."); + } +#endif + + + //Register-offset + Address(Register r, Register r1, shift_op shift = lsl(), enum reg_op op = ADD, + enum wb_mode wbm = off) + : _base(r), _index(r1), _offset(0), _acc_mode(reg), _wb_mode(wbm), _as_op(op), + _shift(shift), _target(0) { + assert(!shift.is_register(), "Can't shift a register-offset address by a register"); + } + + Address(address target, RelocationHolder const& rspec) + : _acc_mode(lit), + _base(sp), + _wb_mode(off), + _rspec(rspec), + _is_lval(false), + _target(target) + { } + Address(address target, relocInfo::relocType rtype = relocInfo::external_word_type); + +#ifdef COMPILER2 + static Address make_raw(int base, int index, int scale, unsigned long o, relocInfo::relocType disp_reloc); +#endif + + private: + //Could be either + void AddressConstruct(Register base, RegisterOrConstant index, enum reg_op op, shift_op shift, + enum wb_mode mode); + public: + + Address(Register base, RegisterOrConstant index, enum reg_op op, enum wb_mode mode) { + AddressConstruct(base, index, op, lsl(), mode); + } + Address(Register base, RegisterOrConstant index, shift_op shift = lsl(), enum reg_op op = ADD, + enum wb_mode mode = off) { + if(shift.kind() != lsl().kind()) { + assert(index.is_register(), "should be"); + } 
+ AddressConstruct(base, index, op, shift, mode); + } + + + Register base() const { + //in aarch64 this didn't apply to preindex mode -> why? + guarantee(_acc_mode == imm || _acc_mode == reg, "wrong mode"); + return _base; + } + long offset() const { + return _offset; + } + Register index() const { + return _index; + } + shift_op shift() const { + return _shift; + } + reg_op op() const { + return _as_op; + } + access_mode get_mode() const { + return _acc_mode; + } + wb_mode get_wb_mode() const { + return _wb_mode; + } + bool uses(Register reg) const { return _base == reg || _index == reg; } + unsigned reg_bits() { return _base->bit(_acc_mode != no_mode) | _index->bit(_acc_mode == reg); } + address target() const { return _target; } + const RelocationHolder& rspec() const { return _rspec; } + + void encode(Instruction_aarch32 *i, CodeSection *sec, address pc) const; + + void fp_encode(Instruction_aarch32 *i, CodeSection *sec, address pc) const; + + void lea(MacroAssembler *, Register) const; + + typedef enum { + IDT_BOOLEAN = T_BOOLEAN, + IDT_CHAR = T_CHAR, + IDT_FLOAT = T_FLOAT, + IDT_DOUBLE = T_DOUBLE, + IDT_BYTE = T_BYTE, + IDT_SHORT = T_SHORT, + IDT_INT = T_INT, + IDT_LONG = T_LONG, + IDT_OBJECT = T_OBJECT, + IDT_ARRAY = T_ARRAY, + IDT_ADDRESS = T_ADDRESS, + IDT_METADATA = T_METADATA, + // not really a data type, denotes the use when address value is needed + // itself, and Address instance is not used to fetch actual data from memory + IDT_LEA = 100, + // ldrex*/strex* + IDT_ATOMIC = 101, + // multi-word memory access insn (ldmia/stmia etc) + IDT_MULTIWORD + } InsnDataType; + + inline static InsnDataType toInsnDataType(BasicType type) { + return (InsnDataType)type; + } + + Address safe_for(InsnDataType type, MacroAssembler *, Register temp); + bool is_safe_for(InsnDataType); + + static bool offset_ok_for_immed(long offset, InsnDataType type); + static bool shift_ok_for_index(shift_op shift, InsnDataType type); +}; + +// Convience classes +class RuntimeAddress: public Address { + public: + RuntimeAddress(address target) : Address(target, relocInfo::runtime_call_type) {} +}; + +class OopAddress: public Address { + public: + OopAddress(address target) : Address(target, relocInfo::oop_type){} +}; + +class ExternalAddress: public Address { + private: + static relocInfo::relocType reloc_for_target(address target) { + // Sometimes ExternalAddress is used for values which aren't + // exactly addresses, like the card table base. + // external_word_type can't be used for values in the first page + // so just skip the reloc in that case. + return external_word_Relocation::can_be_relocated(target) ? 
relocInfo::external_word_type : relocInfo::none; + } + + public: + ExternalAddress(address target) : Address(target, reloc_for_target(target)) {} +}; + +class InternalAddress: public Address { + public: + InternalAddress(address target) : Address(target, relocInfo::internal_word_type) {} +}; + + +const int FPUStateSizeInWords = FloatRegisterImpl::number_of_registers; + +class Assembler : public AbstractAssembler { + void emit_long(jint x) { + AbstractAssembler::emit_int32(x); + } + +public: + //TODO REMOVE shift_kind from here once done + enum shift_kind { LSL, LSR, ASR, ROR }; + // NOTE RRX is a special case of ROR with shift = 0# + + // Helper functions for shifts + // Here to allow compiler to find global shift_op without :: prefix as lsl is a + // standalone instruction +#define HELPER(NAME) \ + shift_op NAME(int sft = 0) { return ::NAME(sft); } \ + shift_op NAME(Register r) { return ::NAME(r); } + HELPER(lsl); + HELPER(lsr); + HELPER(asr); + HELPER(ror); + shift_op rrx() { return ::rrx(); } +#undef HELPER + + typedef enum { + EQ, NE, HS, CS=HS, LO, CC=LO, MI, PL, VS, VC, HI, LS, GE, LT, GT, LE, AL, NV + } Condition; + + enum { instruction_size = 4 }; + + static const uint32_t nop_insn = 0xe1a00000; + + Address adjust(Register base, int offset, bool preIncrement) { + if (preIncrement) + return Address(base, offset, Address::pre); + else + return Address(base, offset, Address::post); + } + + Address adjust(Register base, Register index, shift_op shift, + enum Address::reg_op op, bool preIncrement) { + return Address(base, index, shift, op, preIncrement ? Address::pre : Address::post); + } + + Address pre(Register base, int offset) { + return adjust(base, offset, true); + } + + Address pre(Register base, Register index, shift_op shift, enum Address::reg_op op) { + return adjust(base, index, shift, op, true); + } + + Address post (Register base, int offset) { + return adjust(base, offset, false); + } + + Instruction_aarch32* current; + + void set_current(Instruction_aarch32* i) { current = i; } + + void f(unsigned val, int msb, int lsb) { + current->f(val, msb, lsb); + } + void f(unsigned val, int msb) { + current->f(val, msb, msb); + } + void sf(long val, int msb, int lsb) { + current->sf(val, msb, lsb); + } + void rf(Register reg, int lsb) { + current->rf(reg, lsb); + } + void rf(FloatRegister reg, int lsb) { + current->rf(reg, lsb); + } + void fixed(unsigned value, unsigned mask) { + current->fixed(value, mask); + } + + void emit() { + emit_long(current->get_insn()); + assert_cond(current->get_bits() == 0xffffffff); + current = NULL; + } + + typedef void (Assembler::* uncond_branch_insn)(address dest); + typedef void (Assembler::* cond_branch_insn)(address dest, Condition cond); + typedef void (Assembler::* cond_ldst_insn)(Register Rt, address dest, Condition cond); + typedef void (Assembler::* cond_fp_ldst_insn)(FloatRegister Vd, address dest, Condition cond); + + void wrap_label(Label &L, uncond_branch_insn insn); + void wrap_label(Label &L, Condition cond, cond_branch_insn insn); + void wrap_label(Register r, Label &L, Condition cond, cond_ldst_insn insn); + void wrap_label(FloatRegister r, Label &L, Condition cond, cond_fp_ldst_insn insn); + +#undef INSN + +// AARCH32 Instructions +// Defined roughly in the order they are found in +// ARM Archicture Reference Manual, section 5 + +#define ZERO_ADDR_REG r0 +#define ONES_ADDR_REG r15 + +// Data processing (register & register-shifted-register) + void reg_instr(int decode, shift_op shift, Condition cond, bool s) { + f(cond, 31, 28), 
f(0b000, 27, 25), f(decode, 24, 21), f(s, 20); + f(shift.shift(), 11, 7), f(shift.kind(), 6, 5), f(0, 4); + } + void reg_shift_reg_instr(int decode, enum shift_op::shift_kind kind, + Condition cond, bool s) { + f(cond, 31, 28), f(0b000, 27, 25), f(decode, 24, 21), f(s, 20); + f(0, 7), f(kind, 6, 5), f(1, 4); + } + +#define INSN(NAME, decode, s_flg) \ + void NAME(Register Rd, Register Rn, Register Rm, shift_op shift = S_DFLT, \ + Condition cond = C_DFLT) { \ + starti; \ + if(shift.is_register()) { \ + reg_shift_reg_instr(decode, shift.kind(), cond, s_flg); \ + rf(Rn, 16), rf(Rd, 12), rf(shift.reg(), 8), rf(Rm, 0); \ + } else { \ + reg_instr(decode, shift, cond, s_flg); \ + rf(Rn, 16), rf(Rd, 12), rf(Rm, 0); \ + } \ + } + INSN(andr, 0b0000, 0); + INSN(eor, 0b0001, 0); + INSN(sub, 0b0010, 0); + INSN(rsb, 0b0011, 0); + INSN(add, 0b0100, 0); + INSN(adc, 0b0101, 0); + INSN(sbc, 0b0110, 0); + INSN(rsc, 0b0111, 0); + INSN(orr, 0b1100, 0); + INSN(bic, 0b1110, 0); + + INSN(ands, 0b0000, 1); + INSN(eors, 0b0001, 1); + INSN(subs, 0b0010, 1); + INSN(rsbs, 0b0011, 1); + INSN(adds, 0b0100, 1); + INSN(adcs, 0b0101, 1); + INSN(sbcs, 0b0110, 1); + INSN(rscs, 0b0111, 1); + INSN(orrs, 0b1100, 1); + INSN(bics, 0b1110, 1); + +#undef INSN + +#define INSN(NAME, decode) \ + void NAME(Register Rn, Register Rm, Condition cond) { \ + NAME(Rn, Rm, S_DFLT, cond); \ + } \ + void NAME(Register Rn, Register Rm, shift_op shift = S_DFLT, \ + Condition cond = C_DFLT) { \ + starti; \ + if(shift.is_register()) { \ + reg_shift_reg_instr(decode, shift.kind(), cond, true); \ + rf(Rn, 16), f(0b0000, 15, 12), rf(shift.reg(), 8), rf(Rm, 0); \ + } else { \ + reg_instr(decode, shift, cond, true); \ + rf(Rn, 16), f(0, 15, 12), rf(Rm, 0); \ + } \ + } + INSN(tst, 0b1000); + INSN(teq, 0b1001); + INSN(cmp, 0b1010); + INSN(cmn, 0b1011); +#undef INSN + +// TODO appears that if Rd = 15 and s flag set then perhaps different method +void mov_internal(int decode, Register Rd, Register Rnm, shift_op shift, bool s, Condition cond) { + starti; + if(shift.is_register()) { + reg_shift_reg_instr(decode, shift.kind(), cond, s); + f(0b0000, 19, 16), rf(Rd, 12), rf(shift.reg(), 8), rf(Rnm, 0); + } else { + reg_instr(decode, shift, cond, s); + f(0, 19, 16), rf(Rd, 12), rf(Rnm, 0); + } +} +void mov(Register Rd, Register Rm, shift_op shift, Condition cond = C_DFLT) { + mov_internal(0b1101, Rd, Rm, shift, false, cond); +} +void movs(Register Rd, Register Rm, shift_op shift, Condition cond = C_DFLT) { + mov_internal(0b1101, Rd, Rm, shift, true, cond); +} +void mov(Register Rd, Register Rm, Condition cond = C_DFLT) { + mov_internal(0b1101, Rd, Rm, S_DFLT, false, cond); +} +void movs(Register Rd, Register Rm, Condition cond = C_DFLT) { + mov_internal(0b1101, Rd, Rm, S_DFLT, true, cond); +} + +void mvn(Register Rd, Register Rm, shift_op shift, Condition cond = C_DFLT) { + mov_internal(0b1111, Rd, Rm, shift, false, cond); +} +void mvns(Register Rd, Register Rm, shift_op shift, Condition cond = C_DFLT) { + mov_internal(0b1111, Rd, Rm, shift, true, cond); +} +void mvn(Register Rd, Register Rm, Condition cond = C_DFLT) { + mov_internal(0b1111, Rd, Rm, S_DFLT, false, cond); +} +void mvns(Register Rd, Register Rm, Condition cond = C_DFLT) { + mov_internal(0b1111, Rd, Rm, S_DFLT, true, cond); +} + +#define INSN(NAME, type, s_flg, ASSERTION) \ + void NAME(Register Rd, Register Rm, unsigned shift, Condition cond = C_DFLT) { \ + assert_cond(ASSERTION); \ + if(s_flg) movs(Rd, Rm, shift_op(type, shift), cond); \ + else mov(Rd, Rm, shift_op(type, shift), cond); \ + } + 
INSN(lsl, shift_op::LSL, 0, true); + INSN(lsr, shift_op::LSR, 0, true); + INSN(asr, shift_op::ASR, 0, true); + INSN(ror, shift_op::ROR, 0, shift != 0); //shift == 0 => RRX + + INSN(lsls, shift_op::LSL, 1, true); + INSN(lsrs, shift_op::LSR, 1, true); + INSN(asrs, shift_op::ASR, 1, true); + INSN(rors, shift_op::ROR, 1, shift != 0); //shift == 0 => RRX +#undef INSN + +#define INSN(NAME, type, s_flg) \ + void NAME(Register Rd, Register Rm, Condition cond = C_DFLT) { \ + if(s_flg) movs(Rd, Rm, shift_op(type, 0), cond); \ + else mov(Rd, Rm, shift_op(type, 0), cond); \ + } + INSN(rrx, shift_op::LSR, 0); + INSN(rrxs, shift_op::LSR, 1); +#undef INSN + +//Data processing (register-shifted-register) +#define INSN(NAME, type, s_flg) \ + void NAME(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { \ + if(s_flg) movs(Rd, Rn, shift_op(type, Rm), cond); \ + else mov(Rd, Rn, shift_op(type, Rm), cond); \ + } + INSN(lsl, shift_op::LSL, 0); + INSN(lsr, shift_op::LSR, 0); + INSN(asr, shift_op::ASR, 0); + INSN(ror, shift_op::ROR, 0); + + INSN(lsls, shift_op::LSL, 1); + INSN(lsrs, shift_op::LSR, 1); + INSN(asrs, shift_op::ASR, 1); + INSN(rors, shift_op::ROR, 1); +#undef INSN + + bool imm_instr(int decode, Register Rd, Register Rn, int imm, Condition cond, + bool s) { + if(!is_valid_for_imm12(imm)) + return false; + { + starti; + f(cond, 31, 28), f(0b001, 27, 25), f(decode, 24, 21), f(s, 20), rf(Rn, 16); + int imm12 = encode_imm12(imm); + rf(Rd, 12), f(imm12, 11, 0); + } + return true; + } + +#define INSN(NAME, decode, s_flg) \ + inline void NAME(Register Rd, Register Rn, unsigned imm, Condition cond = C_DFLT) {\ + bool status = imm_instr(decode, Rd, Rn, imm, cond, s_flg); \ + assert(status, "invalid imm"); \ + } + INSN(andr, 0b0000, 0); + INSN(eor, 0b0001, 0); + INSN(orr, 0b1100, 0); + INSN(bic, 0b1110, 0); + + INSN(ands, 0b0000, 1); + INSN(eors, 0b0001, 1); + INSN(orrs, 0b1100, 1); + INSN(bics, 0b1110, 1); + //NOTE: arithmetic immediate instructions are defined below to allow dispatch. +#undef INSN + protected: + // Mov data to destination register in the shortest number of instructions + // possible. + void mov_immediate(Register dst, uint32_t imm32, Condition cond, bool s); + // Mov data to destination register but always emit enough instructions that would + // permit any 32-bit constant to be loaded. (Allow for rewriting later). 
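+  // For example, 0x12345678 is emitted as movw dst, #0x5678; movt dst, #0x1234
+  // on ARMv6T2/ARMv7, and as a mov of the low byte followed by three orr's of
+  // the remaining bytes otherwise, so that a later patch can rewrite the full
+  // 32-bit value in place.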
+ void mov_immediate32(Register dst, uint32_t imm32, Condition cond, bool s); + + void add_sub_imm(int decode, Register Rd, Register Rn, int imm, + Condition cond, bool s); + + public: +#define INSN(NAME, decode, s_flg) \ + inline void NAME(Register Rd, Register Rn, int imm, Condition cond = C_DFLT) { \ + add_sub_imm(decode, Rd, Rn, imm, cond, s_flg); \ + } \ + inline void NAME(Register Rd, Register Rn, unsigned imm, \ + Condition cond = C_DFLT) { \ + add_sub_imm(decode, Rd, Rn, imm, cond, s_flg); \ + } \ + inline void NAME(Register Rd, Register Rn, long imm, Condition cond = C_DFLT) { \ + add_sub_imm(decode, Rd, Rn, imm, cond, s_flg); \ + } \ + inline void NAME(Register Rd, Register Rn, unsigned long imm, \ + Condition cond = C_DFLT) { \ + add_sub_imm(decode, Rd, Rn, imm, cond, s_flg); \ + } \ + /*Addition dispatch - place in macroassembler?*/ \ + void NAME(Register Rd, Register Rn, RegisterOrConstant operand, \ + Condition cond = C_DFLT) { \ + if(operand.is_register()) { \ + NAME(Rd, Rn, (Register)operand.as_register(), lsl(), cond); \ + } else { \ + NAME(Rd, Rn, (unsigned)operand.as_constant(), cond); \ + } \ + } \ + inline void NAME(Register Rd, Register Rn, unsigned imm, Register Rtmp, \ + Condition cond = C_DFLT) { \ + if (Assembler::operand_valid_for_add_sub_immediate(imm)) \ + NAME(Rd, Rn, imm, cond); \ + else { \ + mov_immediate(Rtmp, imm, cond, false); \ + NAME(Rd, Rn, Rtmp, cond); \ + } \ + } \ + //Note that the RegisterOrConstant version can't take a shift even though + // one of the instructions dispatched to can + INSN(sub, 0b0010, 0); + INSN(rsb, 0b0011, 0); + INSN(add, 0b0100, 0); + INSN(adc, 0b0101, 0); + INSN(sbc, 0b0110, 0); + INSN(rsc, 0b0111, 0); + + INSN(subs, 0b0010, 1); + INSN(rsbs, 0b0011, 1); + INSN(adds, 0b0100, 1); + INSN(adcs, 0b0101, 1); + INSN(sbcs, 0b0110, 1); + INSN(rscs, 0b0111, 1); +#undef INSN + //No need to do reverse as register subtracted from immediate + + // alias for mvn + void inv(Register Rd, Register Rn, Condition cond = C_DFLT) { + mvn(Rd, Rn, cond); + } + //alias for rsb + void neg(Register Rd, Register Rn, Condition cond = C_DFLT) { + rsb(Rd, Rn, 0, cond); + } + void negs(Register Rd, Register Rn, Condition cond = C_DFLT) { + rsbs(Rd, Rn, 0, cond); + } + + // PC-rel. 
addressing + void adr_encode(Register Rd, int imm, Condition cond) { + if (is_valid_for_imm12(imm) || is_valid_for_imm12(-imm)) { + add_sub_imm(0b0100, Rd, r15_pc, imm, cond, false); //opcode for add + } else { + int adjust = 0; + if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { + adjust = 8; // mov_w/mov_t + } else { + adjust = 16; // mov and 3 orr + } + mov_immediate32(Rd, imm - adjust, cond, false); + add(Rd, r15_pc, Rd, cond); + } + } + + void adr(Register Rd, address dest, Condition cond = C_DFLT); + + void adr(Register Rd, const Address &dest, Condition cond = C_DFLT); + + void adr(Register Rd, Label &L, Condition cond = C_DFLT) { + wrap_label(Rd, L, cond, &Assembler::Assembler::adr); + } + +private: + friend void entry(CodeBuffer *cb); +#define INSN(NAME, decode, s_flg) \ + inline void NAME(Register Rd, unsigned imm, Condition cond = C_DFLT) { \ + bool status = imm_instr(decode, Rd, ZERO_ADDR_REG, imm, cond, s_flg); \ + assert(status, "invalid imm"); \ + } \ + inline void NAME(Register Rd, int imm, Condition cond = C_DFLT) { \ + bool status = imm_instr(decode, Rd, ZERO_ADDR_REG, imm, cond, s_flg); \ + assert(status, "invalid imm"); \ + } +public: + + INSN(mov_i, 0b1101, 0); + INSN(mvn_i, 0b1111, 0); + + INSN(movs_i, 0b1101, 1); + INSN(mvns_i, 0b1111, 1); +#undef INSN + + void movw_i(Register Rd, unsigned imm, Condition cond = C_DFLT) { + starti; + assert(imm < (1 << 16), "Immediate too big for movw"); + f(cond, 31, 28), f(0b00110000, 27, 20), f(imm >> 12, 19, 16); + rf(Rd, 12), f(imm & 0xfff, 11, 0); + } + + void movt_i(Register Rd, unsigned imm, Condition cond = C_DFLT) { + starti; + assert(imm < (1 << 16), "Immediate too big for movt"); + f(cond, 31, 28), f(0b00110100, 27, 20), f(imm >> 12, 19, 16); + rf(Rd, 12), f(imm & 0xfff, 11, 0); + } + +#define INSN(NAME, decode) \ + inline void NAME(Register Rn, int imm, Condition cond = C_DFLT) { \ + bool status = imm_instr(decode, ZERO_ADDR_REG, Rn, imm, cond, true); \ + assert(status, "invalid imm"); \ + } \ + inline void NAME(Register Rn, unsigned imm, Condition cond = C_DFLT) { \ + bool status = imm_instr(decode, ZERO_ADDR_REG, Rn, imm, cond, true); \ + assert(status, "invalid imm"); \ + } \ + inline void NAME(Register Rn, int imm, Register Rtmp, Condition cond = C_DFLT) { \ + if (Assembler::operand_valid_for_add_sub_immediate(imm)) \ + NAME(Rn, imm, cond); \ + else { \ + mov_immediate(Rtmp, imm, cond, false); \ + NAME(Rn, Rtmp, cond); \ + } \ + } \ + inline void NAME(Register Rn, unsigned imm, Register Rtmp, Condition cond = C_DFLT) { \ + if (Assembler::operand_valid_for_add_sub_immediate(imm)) \ + NAME(Rn, imm, cond); \ + else { \ + mov_immediate(Rtmp, imm, cond, false); \ + NAME(Rn, Rtmp, cond); \ + } \ + } + INSN(tst, 0b1000); + INSN(teq, 0b1001); + INSN(cmp, 0b1010); + INSN(cmn, 0b1011); +#undef INSN + + +// Multiply and multiply accumulate + void mult_instr(int decode, Register a, Register b, Register c, + Register d, Condition cond, bool s) { + starti; + f(cond, 31, 28), f(0b0000, 27, 24), f(decode, 23, 21), f(s, 20); + rf(a, 16), rf(b, 12), rf(c, 8), rf(d, 0), f(0b1001, 7, 4); + } + + void mul(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { + mult_instr(0b000, Rd, ZERO_ADDR_REG, Rm, Rn, cond, false); + } + void muls(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { + mult_instr(0b000, Rd, ZERO_ADDR_REG, Rm, Rn, cond, true); + } + + void mla(Register Rd, Register Rn, Register Rm, Register Ra, Condition cond = C_DFLT) { + mult_instr(0b001, Rd, Ra, Rm, Rn, cond, false); + } + void 
mlas(Register Rd, Register Rn, Register Rm, Register Ra, Condition cond = C_DFLT) { + mult_instr(0b001, Rd, Ra, Rm, Rn, cond, true); + } + + void mls(Register Rd, Register Rn, Register Rm, Register Ra, Condition cond = C_DFLT) { + mult_instr(0b011, Rd, Ra, Rm, Rn, cond, false); + } + + void umaal(Register RdLo, Register RdHi, Register Rn, Register Rm, Condition cond = C_DFLT) { + mult_instr(0b010, RdHi, RdLo, Rm, Rn, cond, false); + } + +#define INSN(NAME, decode, s_flg) \ + void NAME(Register RdLo, Register RdHi, Register Rn, Register Rm, \ + Condition cond = C_DFLT) { \ + mult_instr(decode, RdHi, RdLo, Rm, Rn, cond, s_flg); \ + } + INSN(umull, 0b100, 0); + INSN(umlal, 0b101, 0); + INSN(smull, 0b110, 0); + INSN(smlal, 0b111, 0); + + INSN(umulls, 0b100, 1); + INSN(umlals, 0b101, 1); + INSN(smulls, 0b110, 1); + INSN(smlals, 0b111, 1); + +#undef INSN + +//Saturating addition and subtraction +#define INSN(NAME, decode) \ + void NAME(Register Rd, Register Rm, Register Rn, Condition cond = C_DFLT) { \ + starti; \ + f(cond, 31, 28), f( 0b00010, 27, 23), f(decode, 22, 21), f(0, 20); \ + rf(Rn, 16), rf(Rd, 12), f( 0b00000101, 11, 4), rf(Rm, 0); \ + } + INSN(qadd, 0b00); + INSN(qsub, 0b01); + INSN(qdadd, 0b10); + INSN(qdsub, 0b11); +#undef INSN + +// Halfword multiply and multiply accumulate + void mul_instr(int decode, Register Ra, Register Rb, Register Rc, Register Rd, + bool N, bool M, Condition cond) { + starti; + f(cond, 31, 28), f(0b00010, 27, 23), f(decode, 22, 21), f(0, 20); + rf(Ra, 16), rf(Rb, 12), rf(Rc, 8), f(1, 7), f(M, 6), f(N, 5), f(0, 4); + rf(Rd, 0); + } + +#define INSN(NAME, decode, N, M) \ + void NAME(Register Rd, Register Rn, Register Rm, Register Ra, \ + Condition cond = C_DFLT) { \ + mul_instr(decode, Rd, Ra, Rm, Rn, N, M, cond); \ + } + INSN(smlabb, 0b00, 0, 0); + INSN(smlabt, 0b00, 0, 1) + INSN(smlatb, 0b00, 1, 0) + INSN(smlatt, 0b00, 1, 1) + + INSN(smlawb, 0b01, 0, 0); + INSN(smlawt, 0b01, 0, 1); +#undef INSN + +#define INSN(NAME, decode, N, M) \ + void NAME(Register RdLo, Register RdHi, Register Rn, Register Rm, \ + Condition cond = C_DFLT) { \ + mul_instr(decode, RdHi, RdLo, Rm, Rn, N, M, cond); \ + } + INSN(smlalbb, 0b10, 0, 0); + INSN(smlalbt, 0b10, 0, 1); + INSN(smlaltb, 0b10, 1, 0); + INSN(smlaltt, 0b10, 1, 1); +#undef INSN + +#define INSN(NAME, decode, N, M) \ + void NAME(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { \ + mul_instr(decode, Rd, ZERO_ADDR_REG, Rm, Rn, N, M, cond); \ + } + INSN(smulwb, 0b01, 1, 0); + INSN(smulwt, 0b01, 1, 1); + + INSN(smulbb, 0b11, 0, 0); + INSN(smulbt, 0b11, 0, 1); + INSN(smultb, 0b11, 1, 0); + INSN(smultt, 0b11, 1, 1); +#undef INSN + +// For Extra load/store instructions, see load/store section +// For Synchronization primitives, see load/store section + +// MSR(immediate), and hints +#define INSN(NAME, decode) \ + void NAME(Condition cond = C_DFLT) { \ + starti; \ + f(cond, 31, 28), f(0b001100100000, 27, 16), f(0b11110000, 15, 8); \ + f(decode, 7, 0); \ + } + INSN(nop, 0b000); + INSN(yield, 0b001); + INSN(wfe, 0b010); + INSN(wfi, 0b011); + INSN(sev, 0b100); + void dbg(int dbg_hint, Condition cond = C_DFLT) { + f(cond, 31, 28), f(0b001100100000, 27, 16), f(0b11110000, 15, 8); + f(0b1111, 7, 4); f(dbg_hint, 3, 0); + } +#undef INSN + + //TODO Misc instructions + void bkpt(unsigned imm) { + starti; + f(AL, 31, 28), f(0b00010010, 27, 20); + f(imm >> 4, 19, 8), f(0b0111, 7, 4), f(imm & 0xf, 3, 0); + } + void hlt(unsigned imm) { + bkpt(imm); + // FIXME This seemed like the best option! 
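+    // (A32 only gained a real hlt instruction in ARMv8, so bkpt is the
+    // closest equivalent available here.)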
+ } + + // Load/store register (all modes) + void load_store_instr(Register Rt, const Address &adr, int op, int op2, int a, int b, + Condition cond) { + starti; + f(cond, 31, 28), f(op, 27, 25), f(a, 22), f(b, 20); + if(op2 >= 0) + f(op2, 7, 4); + //Destination + rf(Rt, 12); + adr.encode(current, code_section(), pc()); + } + + bool encodeable(int decode, address dest) { + long offset = dest - pc(); + switch(decode) { + case 0b010: + // LDR, LDRB, STR, STRB + return uabs(offset) < (1 << 12); + case 0b000: + //LDRD, LDRH, LDRSB, LDRSH, STRH, STRD + return uabs(offset) < (1 << 8); + default: + ShouldNotReachHere(); + } + return false; + } + + +#define INSN_INT(NAME, op, op2, a, b, isload) \ + void NAME(Register Rt, address dest, Condition cond = C_DFLT) { \ + if(encodeable(op, dest)) { /* Plan A */ \ + long offset = dest - pc(); \ + NAME(Rt, Address(r15_pc, offset), cond); \ + } else if(isload){ /* Plan B */ \ + /* TODO check we don't have to relocate this*/ \ + mov_immediate(Rt, (uint32_t)dest, cond, false); \ + NAME(Rt, Address(Rt, 0), cond); \ + } else { /* There is no plan C */ \ + ShouldNotReachHere(); \ + } \ + } \ + void NAME(Register Rt, address dest, relocInfo::relocType rtype, \ + Condition cond = C_DFLT) { \ + guarantee(rtype == relocInfo::internal_word_type, \ + "only internal_word_type relocs make sense here"); \ + NAME(Rt, InternalAddress(dest), cond); \ + } \ + void NAME(Register Rt, Label &L, Condition cond = C_DFLT) { \ + wrap_label(Rt, L, cond, &Assembler::NAME); \ + } + +#define INSN(NAME, op, op2, a, b, isload) \ + void NAME(Register Rt, const Address &adr, Condition cond = C_DFLT) { \ + load_store_instr(Rt, adr, op, op2, a, b, cond); \ + } \ + INSN_INT(NAME, op, op2, a, b, isload); + INSN(ldr, 0b010, -1, 0, 1, 1); + INSN(ldrb, 0b010, -1, 1, 1, 1); + + INSN(ldrsb, 0b000, 0b1101, 0, 1, 1); + INSN(ldrh, 0b000, 0b1011, 0, 1, 1); + INSN(ldrsh, 0b000, 0b1111, 0, 1, 1); + + INSN(str, 0b010, -1, 0, 0, 0); + INSN(strb, 0b010, -1, 1, 0, 0); + INSN(strh, 0b000, 0b1011, 0, 0, 0); + //Note LDRD & STRD are defined with the load/store multiple instructions + + //TODO Need to introduce ldrsb ldrsh - then check that the encoding works properly! 
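+  // Illustrative use of the generated accessors with the Address modes above
+  // (assuming the usual register aliases), e.g.:
+  //   ldr(r0, Address(rfp, -wordSize));        // ldr r0, [r11, #-4]
+  //   str(r1, Address(sp, 4, Address::post));  // str r1, [sp], #4
+  //   ldr(r2, Address(r3, r4, lsl(2)));        // ldr r2, [r3, r4, lsl #2]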
+#undef INSN + + + //Synchronization primitives + void sync_instr(int decode, Register Ra, Register Rb, Register Rc, Register Rd, + Condition cond) { + starti; + f(cond, 31, 28), f(0b0001, 27, 24), f(decode, 23, 20), rf(Ra, 16), rf(Rb, 12); + rf(Rc, 8), f(0b1001, 7, 4), rf(Rd, 0); + } + +#define INSN(NAME, decode) \ + void NAME(Register Rd, Register Rt, Register Rn, Condition cond = C_DFLT) { \ + assert(r15_pc != Rn, "Unpredictable"); \ + sync_instr(decode, Rn, Rd, ONES_ADDR_REG, Rt, cond); \ + } \ + void NAME(Register Rd, Register Rt, Address a, Condition cond = C_DFLT) { \ + assert(a.get_mode() == Address::imm, "must be"); \ + assert(a.offset() == 0, "unsupported"); \ + NAME(Rd, Rt, a.base(), cond); \ + } + INSN( strex, 0b1000); + INSN(strexd, 0b1010); + INSN(strexb, 0b1100); + INSN(strexh, 0b1110); +#undef INSN + +#define INSN(NAME, decode) \ + void NAME(Register Rt, Register Rn, Condition cond = C_DFLT) { \ + assert(r15_pc != Rn, "Unpredictable"); \ + sync_instr(decode, Rn, Rt, ONES_ADDR_REG, ONES_ADDR_REG, cond); \ + } \ + void NAME(Register Rt, Address a, Condition cond = C_DFLT) { \ + assert(a.get_mode() == Address::imm, "must be"); \ + assert(a.offset() == 0, "unsupported"); \ + NAME(Rt, a.base(), cond); \ + } + INSN(ldrex, 0b1001); + INSN(ldrexd, 0b1011); + INSN(ldrexb, 0b1101); + INSN(ldrexh, 0b1111); +#undef INSN + +// Media instructions +void media_instr(int decode, int decode2, Condition cond) { + f(cond, 31, 28), f(0b011, 27, 25), f(decode, 24, 20); + f(decode2, 7, 5), f(1, 4); +} + +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { \ + starti; \ + media_instr(0b00000 | decode, decode2, cond); \ + rf(Rn, 16), rf(Rd, 12), f(0b1111, 11, 8), rf(Rm, 0); \ + } + INSN(sadd16, 0b01, 0b000); + INSN(sasx, 0b01, 0b001); + INSN(ssax, 0b01, 0b010); + INSN(ssub16, 0b01, 0b011); + INSN(sadd8, 0b01, 0b100); + INSN(ssub8, 0b01, 0b111); + //Saturating + INSN(qadd16, 0b10, 0b000); + INSN(qasx, 0b10, 0b001); + INSN(qsax, 0b10, 0b010); + INSN(qsub16, 0b10, 0b011); + INSN(qadd8, 0b10, 0b100); + INSN(qsub8, 0b10, 0b111); + //Halving + INSN(shadd16, 0b11, 0b000); + INSN(shasx, 0b11, 0b001); + INSN(shsax, 0b11, 0b010); + INSN(shsub16, 0b11, 0b011); + INSN(shadd8, 0b11, 0b100); + INSN(shsub8, 0b11, 0b111); + + //Now unsigned + INSN(uadd16, 0b101, 0b000); + INSN(uasx, 0b101, 0b001); + INSN(usax, 0b101, 0b010); + INSN(usub16, 0b101, 0b011); + INSN(uadd8, 0b101, 0b100); + INSN(usub8, 0b101, 0b111); + //Saturating + INSN(uqadd16, 0b110, 0b000); + INSN(uqasx, 0b110, 0b001); + INSN(uqsax, 0b110, 0b010); + INSN(uqsub16, 0b110, 0b011); + INSN(uqadd8, 0b110, 0b100); + INSN(uqsub8, 0b110, 0b111); + //Halving + INSN(uhadd16, 0b111, 0b000); + INSN(uhasx, 0b111, 0b001); + INSN(uhsax, 0b111, 0b010); + INSN(uhsub16, 0b111, 0b011); + INSN(uhadd8, 0b111, 0b100); + INSN(uhsub8, 0b111, 0b111); +#undef INSN + +//Packing, unpacking, saturation and reversal +// Note rotation can only be one of ROR #0 ROR #8 ROR #16 ROR #24 +void extend_instr(int decode, int decode2, int decode3, Register Rd, Register Rn, + Register Rm, shift_op shift, Condition cond) { + starti; + assert(0 == shift.shift() || + shift_op::ROR == shift.kind(), "Only ROR may be used for op"); + // All zero shifts are mapped to LSL #0 + int shift_enc = 0; + switch(shift.shift()) { + case 0: break; + case 8: shift_enc = 1; break; + case 16: shift_enc = 2; break; + case 24: shift_enc = 3; break; + default: assert(false, "Invalid shift quantity"); + } + media_instr(0b01000 | decode, decode2, 
cond); + rf(Rn, 16), rf(Rd, 12), f(shift_enc, 11, 10), f(decode3, 9, 8), rf(Rm, 0); +} +void extend_instr(int decode, int decode2, int decode3, Register Rd, Register Rn, + unsigned imm, Condition cond) { + starti; + media_instr(0b01000 | decode, decode2, cond); + rf(Rn, 0), rf(Rd, 12), f(decode3, 11, 8), f(imm, 19, 16); +} + +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, Register Rn, Register Rm, shift_op shift = ::ror(), \ + Condition cond = C_DFLT) { \ + assert(0xf != Rn->encoding_nocheck(), "Rn = pc makes different instruction"); \ + extend_instr(decode, decode2, 0b00, Rd, Rn, Rm, shift, cond); \ + } + INSN(sxtab16, 0b000, 0b011); + INSN(sxtab, 0b010, 0b011); + INSN(sxtah, 0b011, 0b011); + INSN(uxtab16, 0b100, 0b011); + INSN(uxtab, 0b110, 0b011); + INSN(uxtah, 0b111, 0b011); +#undef INSN + +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, Register Rm, shift_op shift = ::ror(), \ + Condition cond = C_DFLT) { \ + extend_instr(decode, decode2, 0b00, Rd, ONES_ADDR_REG, Rm, shift, cond); \ + } + INSN(sxtb16, 0b000, 0b011); + INSN(sxtb, 0b010, 0b011); + INSN(sxth, 0b011, 0b011); + INSN(uxtb16, 0b100, 0b011); + INSN(uxtb, 0b110, 0b011); + INSN(uxth, 0b111, 0b011); +#undef INSN + +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, unsigned imm, Register Rn, Condition cond = C_DFLT) { \ + extend_instr(decode, decode2, 0b1111, Rd, Rn, imm, cond); \ + } + INSN(usat16, 0b110, 0b001); +#undef INSN + + //Reverse instructions +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, Register Rm, Condition cond = C_DFLT) { \ + extend_instr(decode, decode2, 0b11, Rd, ONES_ADDR_REG, Rm, ::ror(24), cond); \ + } + INSN(rev, 0b011, 0b001); + INSN(rev16, 0b011, 0b101); + INSN(rbit, 0b111, 0b001); + INSN(revsh, 0b111, 0b101); +#undef INSN + +// Signed multiply, signed and unsigned divide +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { \ + starti; \ + media_instr(0b10000 | decode, decode2, cond); \ + rf(Rd, 16), f(0b1111, 15, 12), rf(Rm, 8), rf(Rn, 0); \ + } + INSN(sdiv, 0b001, 0b000); + INSN(udiv, 0b011, 0b000); + INSN(smuad, 0b000, 0b000); + INSN(smuadx, 0b000, 0b001); + INSN(smusd, 0b000, 0b010); + INSN(smusdx, 0b000, 0b011); + INSN(smmul, 0b101, 0b000); + INSN(smmulr, 0b101, 0b001); + //TODO ALL THE REST! 
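+  // Note that A32 sdiv/udiv are an optional extension on ARMv7-A, so they can
+  // only be used on hardware that actually implements the integer divide
+  // instructions.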
+#undef INSN + +// Remainder of things +#define INSN(NAME, decode, decode2) \ + void NAME(Register Rd, Register Rn, int lsb, int width, \ + Condition cond = C_DFLT) { \ + starti; \ + assert(lsb >= 0 && lsb < 32, "lsb out of range"); \ + assert(width > 0 && width <= 32 - lsb, "width out of range"); \ + media_instr(decode, decode2, cond); \ + f(width - 1, 20, 16), rf(Rd, 12), f(lsb, 11, 7), rf(Rn, 0); \ + } + INSN(sbfx, 0b11010, 0b010); + INSN(ubfx, 0b11110, 0b010); +#undef INSN + +void bfi(Register Rd, Register Rn, int lsb, int width, Condition cond = C_DFLT) { + assert(VM_Version::features() & (FT_ARMV6T2 | FT_ARMV7), "unsupported on the cpu"); + int msb = lsb + width - 1; + assert(lsb >= 0 && lsb < 32, "lsb out of range"); + assert(msb < 32 && msb >= lsb, "width out of range"); + starti; + media_instr(0b11100, 0b000, cond); + f(msb, 20, 16), rf(Rd, 12), f(lsb, 11, 7), rf(Rn, 0); +} + +void bfc(Register Rd, int lsb, int width, Condition cond = C_DFLT) { + assert(VM_Version::features() & (FT_ARMV6T2 | FT_ARMV7), "unsupported on the cpu"); + int msb = lsb + width - 1; + assert(lsb >= 0 && lsb < 32, "lsb out of range"); + assert(msb < 32 && msb >= lsb, "width out of range"); + starti; + media_instr(0b11100, 0b000, cond); + f(msb, 20, 16), rf(Rd, 12), f(lsb, 11, 7), f(0b1111, 3, 0); +} + +void clz(Register Rd, Register Rm, Condition cond = C_DFLT) { + assert(Rd != r15_pc && Rm != r15_pc, "must be"); + starti; + f(cond, 31, 28), f(0b000101101111, 27, 16), rf(Rd, 12); + f(0b11110001, 11, 4), rf(Rm, 0); +} + +//Branch, branch with link, and block data transfer + +void block_imm_instr(int decode, int w, Register Rn, unsigned regset, + Condition cond) { + starti; + f(cond, 31, 28), f(0b10, 27, 26), f(decode | (w << 1), 25, 20); + rf(Rn, 16), f(regset, 15, 0); +} +#define INSN(NAME, decode) \ + void NAME(Register Rn, unsigned regset, bool wb = true, Condition cond = C_DFLT) { \ + block_imm_instr(decode, wb, Rn, regset, cond); \ + } + INSN(stmda, 0b000000); + INSN(stmed, 0b000000); + + INSN(ldmda, 0b000001); + INSN(ldmfa, 0b000001); + + //INSN(stm, 0b001000); + INSN(stmia, 0b001000); + INSN(stmea, 0b001000); + + //INSN(ldm, 0b001001); + INSN(ldmia, 0b001001); + INSN(ldmfd, 0b001001); + + INSN(stmdb, 0b010000); + INSN(stmfd, 0b010000); + + INSN(ldmdb, 0b010001); + INSN(ldmea, 0b010001); + + INSN(stmib, 0b011000); + INSN(stmfa, 0b011000); + + INSN(ldmib, 0b011001); + INSN(ldmed, 0b011001); +#undef INSN + +unsigned count_bits(unsigned val); +bool can_ldst_multiple( unsigned regset, const Address& adr); + +//NOTE!! 
Have repurposed stm and ldm for auto dispatch instructions +#define INSN(NAME, PREFIX) \ + void NAME(unsigned regset, const Address& adr, Condition cond = C_DFLT) { \ + assert(can_ldst_multiple(regset, adr), "Can't do anything with this!"); \ + int offset = adr.offset(); \ + switch(adr.get_wb_mode()) { \ + case Address::pre: \ + if(offset > 0) PREFIX##mib(adr.base(), regset, true, cond); \ + else PREFIX##mdb(adr.base(), regset, true, cond); \ + break; \ + case Address::post: \ + if(offset > 0) PREFIX##mia(adr.base(), regset, true, cond); \ + else PREFIX##mda(adr.base(), regset, offset != 0, cond); \ + break; \ + case Address::off: \ + if(offset > 0) PREFIX##mib(adr.base(), regset, false, cond); \ + else if(!offset) PREFIX##mia(adr.base(), regset, false, cond); \ + else PREFIX##mdb(adr.base(), regset, false, cond); \ + break; \ + default: \ + ShouldNotReachHere(); \ + } \ + } + INSN(ldm, ld); + INSN(stm, st); +#undef INSN + +//Made push and pop operate on full descending stacks +#define INSN(NAME, CNAME) \ + inline void NAME(unsigned regset, Condition cond = C_DFLT) { \ + CNAME(r13, regset, true, cond); \ + } + INSN(pop, ldmia); + INSN(push, stmdb); +#undef INSN + + public: + +#define INSN(NAME, PREFIX, op, op2, a, b, isload) \ + void NAME(Register Rt, const Address& adr, Condition cond = C_DFLT) { \ + load_store_instr(Rt, adr, op, op2, a, b, cond); \ + } \ + INSN_INT(NAME, op, op2, a, b, isload); + + INSN(ldrd, ld, 0b000, 0b1101, 0, 0, 1); + INSN(strd, st, 0b000, 0b1111, 0, 0, 0); +#undef INSN +#undef INSN_INT + + // Branches + + // For immediate branches: + // The maximum range of a branch is fixed for the aarch32 + // architecture. In debug mode we shrink it in order to test + // trampolines, but not so small that branches in the interpreter + // are out of range. 
Compiler2 is ported in the assumption that code cache is + // always reachable with immediate branch, so cannot restrict the size + static const unsigned long branch_range = + COMPILER2_PRESENT(32 * M) NOT_COMPILER2(NOT_DEBUG(32 * M) DEBUG_ONLY(2 * M)); + static bool reachable_from_branch_at(address branch, address target) { + return uabs(target - branch) < branch_range; + } + + void branch_imm_instr(int decode, address dest, Condition cond) { + starti; + // Correct PC for as it will be when executing this instruction + int offset = (dest - (pc() + 8)) >> 2; + assert(reachable_from_branch_at(pc(), dest), "branch target unreachable"); + f(cond, 31, 28), f(decode, 27, 24), sf(offset, 23, 0); + } + + void branch_reg_instr(int decode, Register Rm, Condition cond) { + starti; + f(cond, 31, 28), f(0b00010010, 27, 20); + f(0b111111111111, 19, 8), f(decode, 7, 4), rf(Rm, 0); + } + +#define INSN(NAME, decode_imm, decode_reg) \ + void NAME(Register Rm, Condition cond = C_DFLT) { \ + branch_reg_instr(decode_reg, Rm, cond); \ + } \ + void NAME(address dest, Condition cond = C_DFLT) { \ + branch_imm_instr(decode_imm, dest, cond); \ + } \ + void NAME(Label &L, Condition cond = C_DFLT) { \ + wrap_label(L, cond, &Assembler::NAME); \ + } \ + void NAME(const Address &dest, Condition cond = C_DFLT) { \ + code_section()->relocate(pc(), dest.rspec()); \ + NAME(dest.target(), cond); \ + } + //TODO assert type of address + INSN(b, 0b1010, 0b0001); // B & BX + INSN(bl, 0b1011, 0b0011); // BL & BLX +#undef INSN + + +//TODO Coprocessor instructions, and Supervisor Call + + +// Unconditional Instructions + enum barrier {OSHST = 0b0010, OSH, + NSHST = 0b0110, NSH, + ISHST = 0b1010, ISH, + ST = 0b1110, SY}; + + void sync_instr(int decode, enum barrier option) { + starti; + f(0b11110, 31, 27), f(0b1010111, 26, 20), f(0b111111110000, 19, 8); + f(decode, 7, 4), f(option, 3, 0); + } + void clrex() { + sync_instr(0b0001, SY); + } + void dsb(enum barrier option) { + sync_instr(0b0100, option); + } + void dmb(enum barrier option) { + sync_instr(0b0101, option); + } + void bkpt(); + void isb() { + sync_instr(0b0110, SY); + } + + void udf(int imm_16) { + assert((imm_16 >> 16) == 0, "encoding constraint"); + emit_int32(0xe7f000f0 | (imm_16 & 0xfff0) << 8 | (imm_16 & 0xf)); + } + + // And the relevant instructions for ARMv6. + + // MCR , , , , {, } + void mcr(int cpc_dex, int opc1, Register Rt, int cpc_reg_dex1, + int cpc_reg_dex2, int opc2, Condition cond = C_DFLT) { + starti; + f(cond, 31, 28), f(0b1110, 27, 24), f(opc1, 23, 21), f(0, 20); + f(cpc_reg_dex1, 19, 16), rf(Rt, 12), f(cpc_dex, 11, 8); + f(opc2, 7, 5), f(1, 4), f(cpc_reg_dex2, 3, 0); + } + + // These instructions do not read the value of the register passed, + // can be any. Chosen r0. + void cp15dmb(Condition cond = C_DFLT) { + mcr(15, 0, r0, 7, 10, 5, cond); + } + + void cp15dsb(Condition cond = C_DFLT) { + mcr(15, 0, r0, 7, 10, 4, cond); + } + + void cp15isb(Condition cond = C_DFLT) { + mcr(15, 0, r0, 7, 5, 4, cond); + } + + enum Membar_mask_bits { + // We can use ISH for a barrier because the ARM ARM says "This + // architecture assumes that all Processing Elements that use the + // same operating system or hypervisor are in the same Inner + // Shareable shareability domain." 
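+ // Note also that ARMv7 DMB has no load-only ordering option (the *LD options
+ // are ARMv8 additions), so LoadLoad and LoadStore below fall back to a full
+ // ISH barrier.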
+ StoreStore = ISHST, + LoadStore = ISH, //ISHLD, Changed to + LoadLoad = ISH, //ISHLD, + StoreLoad = ISH, + AnyAny = ISH + }; + + void mrs(Register Rd, Condition cond = C_DFLT) { + starti; + f(cond, 31, 28), f(0b00010, 27, 23), f(0, 22), f(0b00, 21, 20), f(0b1111, 19, 16); + rf(Rd, 12), f(0b000000000000, 11, 0); + } + + void msr(Register Rn, bool nzcvq = true, bool g = true, Condition cond = C_DFLT) { + starti; + f(cond, 31, 28), f(0b00010, 27, 23), f(0, 22), f(0b10, 21, 20); + f(nzcvq ? 1 : 0, 19), f(g ? 1 : 0, 18), f(0b00, 17, 16); + f(0b111100000000, 15, 4), rf(Rn, 0); + } + +// Floating point operations + +enum fpscr_cond { FP_EQ = 0b0110 << 28, + FP_LT = 0b1000 << 28, + FP_GT = 0b0010 << 28, + FP_UN = 0b0011 << 28, + FP_MASK = 0b1111 << 28 }; + + void fp_instr_base(bool is64bit, Condition cond) { + f(cond, 31, 28), f(0b1110, 27, 24), f(0b101, 11, 9), f(is64bit, 8), f(0, 4); + } + + void fp_rencode(FloatRegister reg, bool is64bit, int base, int bit) { + int reg_val = reg->encoding_nocheck(); + if(!is64bit) { + f( reg_val >> 1, base + 3, base); + f( reg_val & 1, bit); + } else { + f( reg_val & 0xf, base + 3, base); + f( reg_val >> 4, bit); + } + } + + void fp_instr(int decode, int op, bool is64bit, FloatRegister Rd, FloatRegister Rn, + FloatRegister Rm, Condition cond) { + fp_instr_base(is64bit, cond); + f(decode, 23, 20), f(op, 6); + // Register encoding is a bit involved + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rn, false, 16, 7); + fp_rencode(Rd, false, 12, 22); + fp_rencode(Rm, false, 0, 5); + } + +#define INSN(NAME, decode, op, is64bit) \ + void NAME(FloatRegister Rd, FloatRegister Rn, FloatRegister Rm, \ + Condition cond = C_DFLT) { \ + starti; \ + fp_instr(decode, op, is64bit, Rd, Rn, Rm, cond); \ + } + INSN(vmla_f32, 0b0000, 0, 0); + INSN(vmla_f64, 0b0000, 0, 1); + INSN(vmls_f32, 0b0000, 1, 0); + INSN(vmls_f64, 0b0000, 1, 1); + + INSN(vnmla_f32, 0b0001, 1, 0); + INSN(vnmla_f64, 0b0001, 1, 1); + INSN(vnmls_f32, 0b0001, 0, 0); + INSN(vnmls_f64, 0b0001, 0, 1); + INSN(vnmul_f32, 0b0010, 1, 0); + INSN(vnmul_f64, 0b0010, 1, 1); + INSN(vmul_f32, 0b0010, 0, 0); + INSN(vmul_f64, 0b0010, 0, 1); + + INSN(vadd_f32, 0b0011, 0, 0); + INSN(vadd_f64, 0b0011, 0, 1); + INSN(vsub_f32, 0b0011, 1, 0); + INSN(vsub_f64, 0b0011, 1, 1); + + INSN(vdiv_f32, 0b1000, 0, 0); + INSN(vdiv_f64, 0b1000, 0, 1); + + INSN(vfnma_f32, 0b1001, 1, 0); + INSN(vfnma_f64, 0b1001, 1, 1); + INSN(vfnms_f32, 0b1001, 0, 0); + INSN(vfnms_f64, 0b1001, 0, 1); + + INSN(vfma_f32, 0b1010, 0, 0); + INSN(vfma_f64, 0b1010, 0, 1); + INSN(vfms_f32, 0b1010, 1, 0); + INSN(vfms_f64, 0b1010, 1, 1); +#undef INSN + + + void vmov_imm(FloatRegister Rd, unsigned imm, bool is64bit, Condition cond); + void vmov_imm(FloatRegister Rd, unsigned imm); + void vmov_imm_zero(FloatRegister Rd, bool is64bit, Condition cond); + + unsigned encode_float_fp_imm(float imm_f); + + void vmov_f32(FloatRegister Rd, float imm, Condition cond = C_DFLT) { + vmov_imm(Rd, encode_float_fp_imm(imm), false, cond); + } + + unsigned encode_double_fp_imm(double imm_f); + + void vmov_f64(FloatRegister Rd, double imm, Condition cond = C_DFLT) { + bool positive_zero = (imm == 0.0) && !signbit(imm); + if(positive_zero) vmov_imm_zero(Rd, true, cond); + else vmov_imm(Rd, encode_double_fp_imm(imm), true, cond); + } + +#define INSN(NAME, decode, op, is64bit) \ + void NAME(FloatRegister Rd, FloatRegister Rm, Condition cond = C_DFLT) { \ + starti; \ + fp_instr_base(is64bit, cond); \ + f(0b1011, 23, 20), f(decode, 19, 16), f(op, 7, 6), 
f(0b00, 5, 4); \ + /* double register passed (see 'd0'-'dN' encoding), not reencode it's number */ \ + fp_rencode(Rd, false, 12, 22); \ + fp_rencode(Rm, false, 0, 5); \ + } + INSN(vmov_f32, 0b0000, 0b01, 0); + INSN(vmov_f64, 0b0000, 0b01, 1); + INSN(vabs_f32, 0b0000, 0b11, 0); + INSN(vabs_f64, 0b0000, 0b11, 1); + INSN(vneg_f32, 0b0001, 0b01, 0); + INSN(vneg_f64, 0b0001, 0b01, 1); + INSN(vsqrt_f32, 0b0001, 0b11, 0); + INSN(vsqrt_f64, 0b0001, 0b11, 1); +#undef INSN + +//ARM -> FP, FP -> ARM +// NOTE - Have only implemented the double precision variant as only operating on +// double registers - can still be used to copy single precision +void vmov64_instr_base(FloatRegister Rm, Register Rt, Register Rt2, int op, + Condition cond) { + starti; + f(cond, 31, 28), f(0b1100010, 27, 21), f(op, 20); + rf(Rt2, 16), rf(Rt, 12), f(0b101100, 11, 6), f(1, 4); + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rm, false, 0, 5); +} + +void vmov_f64(FloatRegister Rm, Register Rt, Register Rt2, Condition cond = C_DFLT) { + vmov64_instr_base(Rm, Rt, Rt2, 0, cond); +} +void vmov_f64(Register Rt, Register Rt2, FloatRegister Rm, Condition cond = C_DFLT) { + vmov64_instr_base(Rm, Rt, Rt2, 1, cond); +} + +void vmov_f32(FloatRegister Rn, Register Rt, Condition cond = C_DFLT) { + starti; + fp_instr_base(false, cond); + f(0b000, 23, 21), f(0, 20); + rf(Rt, 12), f(0b101000010000, 11, 0); + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rn, false, 16, 7); +} +void vmov_f32(Register Rt, FloatRegister Rn, Condition cond = C_DFLT) { + starti; + fp_instr_base(false, cond); + f(0b000, 23, 21), f(1, 20); + rf(Rt, 12), f(0b101000010000, 11, 0); + // double register passed (see 'd0'-'dN' encoding), not reencode it's number + fp_rencode(Rn, false, 16, 7); +} + +// Floating-point comparison +#define INSN(NAME, E, is64bit) \ + void NAME(FloatRegister Rd, int imm, Condition cond = C_DFLT) { \ + assert(0 == imm, "vector compare can only be with another vector or zero"); \ + starti; \ + fp_instr_base(is64bit, cond); \ + f(0b10110101, 23, 16), f(E, 7), f(0b1000000, 6, 0); \ + /* double register passed (see 'd0'-'dN' encoding), not reencode it's number */ \ + fp_rencode(Rd, false, 12, 22); \ + } \ + void NAME(FloatRegister Vd, FloatRegister Vm, Condition cond = C_DFLT) { \ + starti; \ + fp_instr_base(is64bit, cond); \ + f(0b10110100, 23, 16), f(E, 7), f(1, 6), f(0, 4); \ + /* double register passed (see 'd0'-'dN' encoding), not reencode it's number */ \ + fp_rencode(Vd, false, 12, 22), fp_rencode(Vm, false, 0, 5); \ + } + INSN(vcmpe_f64, 1, 1); + INSN(vcmpe_f32, 1, 0); + INSN( vcmp_f64, 0, 1); + INSN( vcmp_f32, 0, 0); +#undef INSN + +//Move FPSCR to ARM register +void vmrs(Register Rt, Condition cond = C_DFLT) { + starti; + f(cond, 31, 28), f(0b111011110001, 27, 16), rf(Rt, 12), f(0b101000010000, 11, 0); +} + +//Move ARM register to FPSCR +void vmsr(Register Rt, Condition cond = C_DFLT) { + starti; + f(cond, 31, 28), f(0b111011100001, 27, 16), rf(Rt, 12), f(0b101000010000, 11, 0); +} + +// TODO These instructions use round towards zero mode. 
It is possible +// for the mode to be taken from the FPSCR however it doesn't do it currently +#define INSN(NAME, decode2, b19, op, is64bitRd, is64bitRm, sz) \ + void NAME(FloatRegister Rd, FloatRegister Rm, Condition cond = C_DFLT) { \ + starti; \ + fp_instr_base(sz, cond); \ + f(0b1011, 23, 20), f(b19, 19), f(decode2, 18, 16), f(op, 7), f(0b100, 6, 4); \ + /* double register passed (see 'd0'-'dN' encoding), not reencode it's number */ \ + fp_rencode(Rd, false, 12, 22); \ + fp_rencode(Rm, false, 0, 5); \ + } + INSN(vcvt_s32_f32, 0b101, 1, 1, 0, 0, 0); + INSN(vcvt_s32_f64, 0b101, 1, 1, 0, 1, 1); + INSN(vcvt_u32_f32, 0b100, 1, 1, 0, 0, 0); + INSN(vcvt_u32_f64, 0b100, 1, 1, 0, 1, 1); + + INSN(vcvt_f64_s32, 0b000, 1, 1, 1, 0, 1); + INSN(vcvt_f64_u32, 0b000, 1, 0, 1, 0, 1); + INSN(vcvt_f32_s32, 0b000, 1, 1, 0, 0, 0); + INSN(vcvt_f32_u32, 0b000, 1, 0, 0, 0, 0); + + INSN(vcvt_f32_f64, 0b111, 0, 1, 0, 1, 1); + INSN(vcvt_f64_f32, 0b111, 0, 1, 1, 0, 0); +#undef INSN + +//Vector load/store + private: + void fp_ldst_instr(int decode, bool is64bit, const Address& adr, Condition cond); + public: + +#define INSN(NAME, decode, is64bit) \ + void NAME(FloatRegister Vd, const Address &adr, Condition cond = C_DFLT) { \ + starti; \ + fp_ldst_instr(decode, is64bit, adr, cond); \ + /* double register passed (see 'd0'-'dN' encoding), not reencode it's number */ \ + fp_rencode(Vd, false, 12, 22); \ + } \ + void NAME(FloatRegister Vd, address dest, Condition cond = C_DFLT) { \ + long offset = dest - pc(); \ + NAME(Vd, Address(r15_pc, offset), cond); \ + } \ + void NAME(FloatRegister Vd, address dest, relocInfo::relocType rtype, \ + Condition cond = C_DFLT) { \ + guarantee(rtype == relocInfo::internal_word_type, \ + "only internal_word_type relocs make sense here"); \ + NAME(Vd, InternalAddress(dest), cond); \ + } \ + void NAME(FloatRegister Vd, Label &L, Condition cond = C_DFLT) { \ + wrap_label(Vd, L, cond, &Assembler::NAME); \ + } + INSN(vstr_f64, 0b10000, 1); + INSN(vstr_f32, 0b10000, 0); + INSN(vldr_f64, 0b10001, 1); + INSN(vldr_f32, 0b10001, 0); +#undef INSN + + private: + enum fp_mode { ia_wb, ia, db_wb }; + void fp_ldst_mul(Register Rn, uint32_t regset, bool load, bool is64bit, enum fp_mode mode, Condition cond); + public: +#define INSN(NAME, EXT, is64bit, load) \ + inline void NAME##ia##EXT(Register Rn, unsigned regset, bool wb = true, \ + Condition cond = C_DFLT) { \ + fp_ldst_mul(Rn, regset, load, is64bit, \ + (enum fp_mode)( ia_wb + ( wb?0:1 )), cond); \ + } \ + inline void NAME##db##EXT(Register Rn, unsigned regset, Condition cond = C_DFLT) { \ + fp_ldst_mul(Rn, regset, load, is64bit, db_wb, cond); \ + } + INSN(vldm, _f32, 0, 1); + INSN(vldm, _f64, 1, 1); + INSN(vstm, _f32, 0, 0); + INSN(vstm, _f64, 1, 0); +#undef INSN + +#undef ZERO_ADDR_REG +#undef ONES_ADDR_REG + +/* SIMD extensions + * + * We just use FloatRegister in the following. They are exactly the same + * as SIMD registers. 
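+ *
+ * (A 128-bit Q register aliases a pair of consecutive 64-bit D registers,
+ * Qn = D2n:D2n+1; the *_128 variants below therefore operate on such pairs,
+ * named by their even-numbered D register.)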
+ */ + public: + enum SIMD_Align { + ALIGN_STD = 0b00, ALIGN_64 = 0b01, ALIGN_128 = 0b10, ALIGN_256 = 0b11 + }; + // multiple single elements +private: + void simd_ldst(FloatRegister, unsigned type, unsigned size, unsigned xfer_size, + const Address &addr, enum SIMD_Align align, unsigned encode); +public: +#define INSN(NAME, size, encode) \ + inline void NAME(FloatRegister Dd, const Address &addr, enum SIMD_Align align) { \ + simd_ldst(Dd, 0b0111, size, 1, addr, align, encode); \ + } \ + inline void NAME(FloatRegister Dd, FloatRegister Dd1, const Address &addr, \ + enum SIMD_Align align) { \ + assert(Dd->successor(FloatRegisterImpl::DOUBLE) == Dd1, "Must be consecutive"); \ + simd_ldst(Dd, 0b1010, size, 2, addr, align, encode); \ + } \ + inline void NAME(FloatRegister Dd, FloatRegister Dd1, FloatRegister Dd2, \ + const Address &addr, enum SIMD_Align align) { \ + assert(Dd->successor(FloatRegisterImpl::DOUBLE) == Dd1, "Must be consecutive"); \ + assert(Dd1->successor(FloatRegisterImpl::DOUBLE) == Dd2, "Must be consecutive"); \ + simd_ldst(Dd, 0b0110, size, 3, addr, align, encode); \ + } \ + inline void NAME(FloatRegister Dd, FloatRegister Dd1, FloatRegister Dd2, \ + FloatRegister Dd3, const Address &addr, enum SIMD_Align align) { \ + assert(Dd->successor(FloatRegisterImpl::DOUBLE) == Dd1, "Must be consecutive"); \ + assert(Dd1->successor(FloatRegisterImpl::DOUBLE) == Dd2, "Must be consecutive"); \ + assert(Dd2->successor(FloatRegisterImpl::DOUBLE) == Dd3, "Must be consecutive"); \ + simd_ldst(Dd, 0b0010, size, 4, addr, align, encode); \ + } + INSN(vld1_8, 0b00, 0b10); + INSN(vld1_16, 0b01, 0b10); + INSN(vld1_32, 0b10, 0b10); + INSN(vld1_64, 0b11, 0b10); + INSN(vst1_8, 0b00, 0b00); + INSN(vst1_16, 0b01, 0b00); + INSN(vst1_32, 0b10, 0b00); + INSN(vst1_64, 0b11, 0b00); +#undef INSN + + // single element to one lane +private: + void simd_ldst_single(FloatRegister Rd, unsigned size, unsigned index, + const Address &addr, bool align, unsigned encode); +public: +#define INSN(NAME, size, encode) \ + inline void NAME(FloatRegister Dd, unsigned index, const Address &addr, bool align) { \ + simd_ldst_single(Dd, size, index, addr, align, encode); \ + } + INSN(vld1_8, 0b00, 0b10); + INSN(vld1_16, 0b01, 0b10); + INSN(vld1_32, 0b10, 0b10); + INSN(vst1_8, 0b00, 0b00); + INSN(vst1_16, 0b01, 0b00); + INSN(vst1_32, 0b10, 0b00); +#undef INSN + +private: + void simd_vmov(FloatRegister Dd, unsigned index, Register Rt, bool advsimd, + unsigned index_bits, unsigned bit20, unsigned opc, Condition cond); +public: +#define INSN(NAME, advsimd, opc, index_bits) \ + inline void NAME(FloatRegister Rd, unsigned index, Register Rt, \ + Condition cond = Assembler::AL) { \ + simd_vmov(Rd, index, Rt, advsimd, index_bits, 0, opc, cond); \ + } + INSN(vmov_8, true, 0b1000, 2); + INSN(vmov_16, true, 0b0001, 1); + INSN(vmov_32, false, 0b0000, 0); +#undef INSN +#define INSN(NAME, advsimd, opc, index_bits) \ + inline void NAME(Register Rt, FloatRegister Rd, unsigned index, \ + Condition cond = Assembler::AL) { \ + simd_vmov(Rd, index, Rt, advsimd, index_bits, 1, opc, cond); \ + } + INSN(vmov_8s, true, 0b01000, 3); + INSN(vmov_16s, true, 0b00001, 2); + INSN(vmov_8u, true, 0b11000, 3); + INSN(vmov_16u, true, 0b10001, 2); + INSN(vmov_32, false, 0b00000, 1); +#undef INSN + +private: + void simd_vmov(FloatRegister Dd, unsigned imm, unsigned q, unsigned op_cmode); +public: +#define INSN(NAME, q, op_cmode) \ + inline void NAME(FloatRegister Dd, unsigned imm) { \ + simd_vmov(Dd, imm, q, op_cmode); \ + } + INSN(vmov_64_8, 0, 0b01110); + 
INSN(vmov_64_16, 0, 0b01000); + INSN(vmov_64_32, 0, 0b00000); + INSN(vmov_128_8, 1, 0b01110); + INSN(vmov_128_16, 1, 0b01000); + INSN(vmov_128_32, 1, 0b00000); +#undef INSN + +private: + void simd_logicalop(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, unsigned q, + unsigned a, unsigned b, unsigned u, unsigned c); +public: +#define INSN(NAME, q, a, b, u, c) \ + inline void NAME(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm) { \ + simd_logicalop(Dd, Dn, Dm, q, a, b, u, c); \ + } + INSN(veor_64, 0, 0b0001, 1, 1, 0b00); + INSN(veor_128, 1, 0b0001, 1, 1, 0b00); + INSN(vand_64, 0, 0b0001, 1, 0, 0b00); + INSN(vand_128, 1, 0b0001, 1, 0, 0b00); + INSN(vorr_64, 0, 0b0001, 1, 0, 0b10); + INSN(vorr_128, 1, 0b0001, 1, 0, 0b10); +#undef INSN + + // vmov is actually a vorr +#define vmov_64(Dd, Dm) vorr_64(Dd, Dm, Dm) +#define vmov_128(Qd, Qm) vorr_128(Qd, Qm, Qm) + +private: + void simd_vmul(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, + unsigned bit24, unsigned bits109, unsigned size, unsigned mul, unsigned bit6); +public: +#define INSN(NAME, bit24, bit9, size, mul, bit6, bit10) \ + inline void NAME(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm) { \ + simd_vmul(Dd, Dn, Dm, bit24, (bit10<<1)|bit9, size, mul, bit6); \ + } + INSN(vmul_64_8, 0, 0, 0b00, 1, 0, 0); + INSN(vmul_64_16, 0, 0, 0b01, 1, 0, 0); + INSN(vmul_64_32, 0, 0, 0b10, 1, 0, 0); + INSN(vmulp_64_8, 1, 0, 0b00, 1, 0, 0); + INSN(vmul_128_8, 0, 0, 0b00, 1, 1, 0); + INSN(vmul_128_16, 0, 0, 0b01, 1, 1, 0); + INSN(vmul_128_32, 0, 0, 0b10, 1, 1, 0); + INSN(vmulp_128_8, 1, 0, 0b00, 1, 1, 0); + INSN(vmull_8s, 0, 0, 0b00, 0, 0, 1); + INSN(vmull_16s, 0, 0, 0b01, 0, 0, 1); + INSN(vmull_32s, 0, 0, 0b10, 0, 0, 1); + INSN(vmull_8u, 1, 0, 0b00, 0, 0, 1); + INSN(vmull_16u, 1, 0, 0b01, 0, 0, 1); + INSN(vmull_32u, 1, 0, 0b10, 0, 0, 1); + INSN(vmullp_8, 0, 1, 0b00, 0, 0, 1); + INSN(vmul_64_f32, 1, 0, 0b00, 1, 0, 1); + INSN(vmul_128_f32,1, 0, 0b00, 1, 1, 1); +#undef INSN + +private: + void simd_vuzp(FloatRegister Dd, FloatRegister Dm, unsigned size, unsigned q); +public: +#define INSN(NAME, size, q) \ + inline void NAME(FloatRegister Dd, FloatRegister Dm) { \ + simd_vuzp(Dd, Dm, size, q); \ + } + INSN(vuzp_64_8, 0b00, 0); + INSN(vuzp_64_16, 0b01, 0); + INSN(vuzp_64_32, 0b10, 0); + INSN(vuzp_128_8, 0b00, 1); + INSN(vuzp_128_16, 0b01, 1); + INSN(vuzp_128_32, 0b10, 1); +#undef INSN + +private: + void simd_vshl(FloatRegister Dd, FloatRegister Dm, unsigned imm, + unsigned q, unsigned u, unsigned encode); +public: +#define INSN(NAME, size, q, u, encode, checkDd) \ + inline void NAME(FloatRegister Dd, FloatRegister Dm, unsigned imm) { \ + assert(!checkDd || (Dd->encoding() & 2) == 0, "Odd register"); \ + unsigned encode_eff = encode; \ + unsigned u_eff = u; \ + imm &= size == 6 ? 0x3f : 0x1f; /* per jvms */ \ + if (imm >= (1u << size)) { /* vshl cannot encode shift by size or more... */ \ + encode_eff = 0b0000; /* .. 
change to equivalent vshr (actually set to 0) */ \ + u_eff = 1; \ + imm = (1u << size); \ + } \ + simd_vshl(Dd, Dm, imm|(1u< (1u << size)) { \ + imm = 1u << size; /* saturate shift */ \ + } else { /* encode the imm per ARM spec */ \ + imm = (1u << size+1) - imm; \ + } \ + simd_vshl(Dd, Dm, imm, q, u, encode_eff); \ + } + INSN(vshr_64_u8, 3, 0, 1); + INSN(vshr_64_u16, 4, 0, 1); + INSN(vshr_64_u32, 5, 0, 1); + INSN(vshr_64_u64, 6, 0, 1); + INSN(vshr_128_u8, 3, 1, 1); + INSN(vshr_128_u16, 4, 1, 1); + INSN(vshr_128_u32, 5, 1, 1); + INSN(vshr_128_u64, 6, 1, 1); + INSN(vshr_64_s8, 3, 0, 0); + INSN(vshr_64_s16, 4, 0, 0); + INSN(vshr_64_s32, 5, 0, 0); + INSN(vshr_64_s64, 6, 0, 0); + INSN(vshr_128_s8, 3, 1, 0); + INSN(vshr_128_s16, 4, 1, 0); + INSN(vshr_128_s32, 5, 1, 0); + INSN(vshr_128_s64, 6, 1, 0); +#undef INSN +#define INSN(NAME, encode, size, q) \ + inline void NAME(FloatRegister Dd, FloatRegister Dm, unsigned imm) { \ + simd_vshl(Dd, Dm, imm|(1u<encoding_nocheck(); \ + simd_dup(Dd, as_FloatRegister(m_num & ~1), m_num & 1, q, 0b00); \ + } + INSN(vdups_64, 0); + INSN(vdups_128, 1); +#undef INSN + +private: + void simd_neg(FloatRegister Dd, FloatRegister Dm, unsigned q, unsigned size); +public: +#define INSN(NAME, q, size) \ + inline void NAME(FloatRegister Dd, FloatRegister Dm) { \ + simd_neg(Dd, Dm, q, size); \ + } + INSN(vneg_64_s8, 0, 0b00); + INSN(vneg_64_s16, 0, 0b01); + INSN(vneg_64_s32, 0, 0b10); + INSN(vneg_128_s8, 1, 0b00); + INSN(vneg_128_s16, 1, 0b01); + INSN(vneg_128_s32, 1, 0b10); +#undef INSN + +private: + void simd_mvn(FloatRegister Dd, FloatRegister Dm, unsigned q); +public: +#define INSN(NAME, q) \ + inline void NAME(FloatRegister Dd, FloatRegister Dm) { \ + simd_mvn(Dd, Dm, q); \ + } + INSN(vmvn_64, 0); + INSN(vmvn_128, 1); +#undef INSN + + // three registers of the same length +private: + void simd_insn(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, + unsigned q, unsigned a, unsigned b, unsigned u, unsigned c); +public: +#define INSN(NAME, q, a, b, u, c) \ + inline void NAME(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm) { \ + simd_insn(Dd, Dn, Dm, q, a, b, u, c); \ + } +#define INSN_GR(NAME, a, b, u) \ + INSN(NAME##_64_8, 0, a, b, u, 0b00) \ + INSN(NAME##_64_16, 0, a, b, u, 0b01) \ + INSN(NAME##_64_32, 0, a, b, u, 0b10) \ + INSN(NAME##_64_64, 0, a, b, u, 0b11) \ + INSN(NAME##_128_8, 1, a, b, u, 0b00) \ + INSN(NAME##_128_16, 1, a, b, u, 0b01) \ + INSN(NAME##_128_32, 1, a, b, u, 0b10) \ + INSN(NAME##_128_64, 1, a, b, u, 0b11) + + INSN_GR(vadd, 0b1000, 0b0, 0b0); + INSN(vadd_64_f32, 0, 0b1101, 0b0, 0b0, 0b00); + INSN(vadd_128_f32, 1, 0b1101, 0b0, 0b0, 0b00); + INSN_GR(vsub, 0b1000, 0b0, 0b1); + INSN(vsub_64_f32, 0, 0b1101, 0b0, 0b0, 0b10); + INSN(vsub_128_f32, 1, 0b1101, 0b0, 0b0, 0b10); + +#undef INSN_GR +#undef INSN + + // three registers of different length +private: + void simd_insn(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, + unsigned qn, unsigned a, unsigned b, unsigned u); +public: +#define INSN(NAME, qn, a, b, u) \ + inline void NAME(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm) { \ + simd_insn(Dd, Dn, Dm, qn, a, b, u); \ + } +#define INSN_GR(NAME, qn, a) \ + INSN(NAME##_8u, qn, a, 0b00, 1) \ + INSN(NAME##_16u, qn, a, 0b01, 1) \ + INSN(NAME##_32u, qn, a, 0b10, 1) \ + INSN(NAME##_8s, qn, a, 0b00, 0) \ + INSN(NAME##_16s, qn, a, 0b01, 0) \ + INSN(NAME##_32s, qn, a, 0b10, 0) + + INSN_GR(vaddw, 1, 0b0001); + INSN_GR(vaddl, 0, 0b0000); +#undef INSN_GR +#undef INSN + + // VEXT, the instruction out of any class +private: + void 
simd_vext(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, unsigned q, unsigned imm); +public: +#define INSN(NAME, q) \ + inline void NAME(FloatRegister Dd, FloatRegister Dn, FloatRegister Dm, unsigned imm) { \ + simd_vext(Dd, Dn, Dm, q, imm); \ + } + INSN(vext_64, 0u); + INSN(vext_128, 1u); +#undef INSN + +public: + +#define INSN(NAME, r) \ + inline void NAME(Address a) { \ + starti; \ + f(0b1111, 31, 28); \ + f(0b0101, 27, 24), f(0b01, 21, 20); \ + f(0b1111, 15, 12); \ + f(r, 22); \ + rf(a.base(), 16); \ + if (a.get_mode() == Address::imm) { \ + f(a.offset() >= 0 ? 1 : 0, 23); \ + f(a.offset() >= 0 ? a.offset() : -a.offset(), 11, 0); \ + } else if (a.get_mode() == Address::reg) { \ + assert(a.get_wb_mode() == Address::off, "must be"); \ + assert(!a.shift().is_register(), "must be"); \ + f(a.op() == Address::ADD ? 1 : 0, 23); \ + rf(a.index(), 0); \ + f(0, 4); \ + f(a.shift().shift(), 11, 7); \ + f(a.shift().kind(), 6, 5); \ + } else { \ + ShouldNotReachHere(); \ + } \ + } + INSN(pld, 1); + INSN(pldw, 0); +#undef INSN + +#define INSN(NAME, size, c) \ + inline void NAME(Register Rd, Register Rn, Register Rm, Condition cond = C_DFLT) { \ + starti; \ + assert(VM_Version::features() & FT_CRC32, "Instruction is not supported by CPU"); \ + f(cond, 31, 28), f(0b00010, 27, 23), f(size, 22, 21), f(0, 20), rf(Rn, 16), rf(Rd, 12); \ + f(0b00, 11, 10), c ? f(0b1, 9) : f(0b0, 9), f(0b00100, 8, 4), rf(Rm, 0); \ + } + INSN(crc32b, 0, 0); + INSN(crc32h, 1, 0); + INSN(crc32w, 2, 0); + INSN(crc32cb, 0, 1); + INSN(crc32ch, 1, 1); + INSN(crc32cw, 2, 1); +#undef INSN + +#define INSN(NAME, opc) \ + inline void NAME(FloatRegister Vd, FloatRegister Vm) { \ + starti; \ + f(0b111100111, 31, 23), f(0b110000, 21, 16), f(0, 4); \ + f(opc, 11, 6), fp_rencode(Vd, false, 12, 22), fp_rencode(Vm, false, 0, 5); \ + } + + INSN(aese, 0b001100); + INSN(aesd, 0b001101); + INSN(aesmc, 0b001110); + INSN(aesimc, 0b001111); + +#undef INSN + + Assembler(CodeBuffer* code) : AbstractAssembler(code) {} + + virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr, + Register tmp, + int offset) { + ShouldNotCallThis(); + return RegisterOrConstant(); + } + + // Stack overflow checking + virtual void bang_stack_with_offset(int offset); + + // Immediate values checks and transformations + + static uint32_t encode_imm12(int imm); + static int decode_imm12(uint32_t imm12); + static bool is_valid_for_imm12(int imm); + + static bool is_valid_for_offset_imm(int imm, int nbits) { + return uabs(imm) < (1u << nbits); + } + + static bool operand_valid_for_logical_immediate(bool is32, uint64_t imm); + static bool operand_valid_for_add_sub_immediate(int imm); + static bool operand_valid_for_add_sub_immediate(unsigned imm); + static bool operand_valid_for_add_sub_immediate(unsigned long imm); + static bool operand_valid_for_add_sub_immediate(jlong imm); + static bool operand_valid_for_float_immediate(float imm); + static bool operand_valid_for_double_immediate(double imm); + + void emit_data64(jlong data, relocInfo::relocType rtype, int format = 0); + void emit_data64(jlong data, RelocationHolder const& rspec, int format = 0); + + // useful to revert back the effect of post/pre addressing modifications + // applied to the base register + void compensate_addr_offset(const Address &adr, Condition cond) { + compensate_addr_offset(adr.base(), adr.index(), adr.shift(), adr.op() == Address::ADD, cond); + } + void compensate_addr_offset(Register Rd, Register Roff, shift_op shift, bool isAdd, Condition cond) { + shift_op shift_back; + + 
if (shift.is_register()) { + switch (shift.kind()) { + case shift_op::LSL: + case shift_op::LSR: + shift_back = asr(shift.reg()); + break; + case shift_op::ASR: + shift_back = lsl(shift.reg()); + break; + case shift_op::ROR: + Unimplemented(); // need a temp register here + break; + default: + ShouldNotReachHere(); + } + } else { + switch (shift.kind()) { + case shift_op::LSL: + case shift_op::LSR: + shift_back = asr(shift.shift()); + break; + case shift_op::ASR: + shift_back = lsl(shift.shift()); + break; + case shift_op::ROR: + shift_back = ror(32-shift.shift()); + break; + default: + ShouldNotReachHere(); + } + } + if (isAdd) + sub(Rd, Rd, Roff, shift_back, cond); + else + add(Rd, Rd, Roff, shift_back, cond); + } +}; + +inline Assembler::Membar_mask_bits operator|(Assembler::Membar_mask_bits a, + Assembler::Membar_mask_bits b) { + return Assembler::Membar_mask_bits(unsigned(a)|unsigned(b)); +} + +Instruction_aarch32::~Instruction_aarch32() { + assem->emit(); +} + +#undef starti + +// Invert a condition +inline const Assembler::Condition operator~(const Assembler::Condition cond) { + return Assembler::Condition(int(cond) ^ 1); +} + +class BiasedLockingCounters; + +extern "C" void das(uint64_t start, int len); + +#endif // CPU_AARCH32_VM_ASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:12.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/assembler_aarch32.inline.hpp 2018-09-25 19:24:12.000000000 +0300 @@ -0,0 +1,34 @@ +/* + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_ASSEMBLER_AARCH32_INLINE_HPP +#define CPU_AARCH32_VM_ASSEMBLER_AARCH32_INLINE_HPP + +#include "asm/assembler.inline.hpp" +#include "asm/codeBuffer.hpp" +#include "code/codeCache.hpp" + +#endif // CPU_AARCH32_VM_ASSEMBLER_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:24:13.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/bytecodes_aarch32.cpp 2018-09-25 19:24:13.000000000 +0300 @@ -0,0 +1,29 @@ +/* + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "interpreter/bytecodes.hpp" + --- /dev/null 2018-09-25 19:24:14.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/bytecodes_aarch32.hpp 2018-09-25 19:24:14.000000000 +0300 @@ -0,0 +1,32 @@ +/* + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_BYTECODES_AARCH32_HPP +#define CPU_AARCH32_VM_BYTECODES_AARCH32_HPP + +// No AArch32 specific bytecodes + +#endif // CPU_AARCH32_VM_BYTECODES_AARCH32_HPP --- /dev/null 2018-09-25 19:24:15.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/bytes_aarch32.hpp 2018-09-25 19:24:15.000000000 +0300 @@ -0,0 +1,86 @@ +/* + * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef CPU_AARCH32_VM_BYTES_AARCH32_HPP
+#define CPU_AARCH32_VM_BYTES_AARCH32_HPP
+
+#include "memory/allocation.hpp"
+
+class Bytes: AllStatic {
+ public:
+
+ // Efficient reading and writing of unaligned unsigned data in platform-specific byte ordering.
+ // Since ARMv6, unaligned short and word accesses are handled by the hardware.
+ // However, an unaligned double-word access causes a kernel trap and software processing,
+ // so we turn it into fast unaligned word accesses.
+ static inline u2 get_native_u2(address p) { return *(u2*)p; }
+ static inline u4 get_native_u4(address p) { return *(u4*)p; }
+ static inline u8 get_native_u8(address p) {
+ if (!(uintptr_t(p) & 3)) {
+ return *(u8*)p;
+ }
+ u4 *const a = (u4*) p;
+ return (u8(a[1]) << 32) | a[0];
+ }
+
+ static inline void put_native_u2(address p, u2 x) { *(u2*)p = x; }
+ static inline void put_native_u4(address p, u4 x) { *(u4*)p = x; }
+ static inline void put_native_u8(address p, u8 x) { *(u8*)p = x; }
+
+
+ // Efficient reading and writing of unaligned unsigned data in Java
+ // byte ordering (i.e. big-endian ordering). Byte-order reversal is
+ // needed since AArch32 uses the little-endian format.
+ static inline u2 get_Java_u2(address p) { return swap_u2(get_native_u2(p)); }
+ static inline u4 get_Java_u4(address p) { return swap_u4(get_native_u4(p)); }
+ static inline u8 get_Java_u8(address p) { return swap_u8(get_native_u8(p)); }
+
+ static inline void put_Java_u2(address p, u2 x) { put_native_u2(p, swap_u2(x)); }
+ static inline void put_Java_u4(address p, u4 x) { put_native_u4(p, swap_u4(x)); }
+ static inline void put_Java_u8(address p, u8 x) {
+ const u8 nx = swap_u8(x);
+ if (!(uintptr_t(p) & 3)) {
+ *(u8*)p = nx;
+ } else {
+ u4 *const a = (u4*) p;
+ a[0] = nx;
+ a[1] = nx >> 32;
+ }
+ }
+
+ // Efficient swapping of byte ordering
+ static inline u2 swap_u2(u2 x); // compiler-dependent implementation
+ static inline u4 swap_u4(u4 x); // compiler-dependent implementation
+ static inline u8 swap_u8(u8 x);
+};
+
+
+// The following header contains the implementations of swap_u2, swap_u4, and swap_u8[_base]
+
+#include OS_CPU_HEADER_INLINE(bytes)
+
+#endif // CPU_AARCH32_VM_BYTES_AARCH32_HPP
--- /dev/null 2018-09-25 19:24:16.000000000 +0300
+++ new/src/hotspot/cpu/aarch32/c1_CodeStubs_aarch32.cpp 2018-09-25 19:24:16.000000000 +0300
@@ -0,0 +1,531 @@
+/*
+ * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "c1/c1_CodeStubs.hpp" +#include "c1/c1_FrameMap.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "nativeInst_aarch32.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" + + +#define __ ce->masm()-> + +#define should_not_reach_here() should_not_reach_here_line(__FILE__, __LINE__) + +void CounterOverflowStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + Metadata *m = _method->as_constant_ptr()->as_metadata(); + __ mov_metadata(rscratch1, m); + ce->store_parameter(rscratch1, 1); + ce->store_parameter(_bci, 0); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::counter_overflow_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + __ b(_continuation); +} + +RangeCheckStub::RangeCheckStub(CodeEmitInfo* info, LIR_Opr index, LIR_Opr array) + : _throw_index_out_of_bounds_exception(false), _index(index), _array(array) { + assert(info != NULL, "must have info"); + _info = new CodeEmitInfo(info); +} + +RangeCheckStub::RangeCheckStub(CodeEmitInfo* info, LIR_Opr index) + : _throw_index_out_of_bounds_exception(true), _index(index), _array(NULL) { + assert(info != NULL, "must have info"); + _info = new CodeEmitInfo(info); +} + +void RangeCheckStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + if (_info->deoptimize_on_exception()) { + address a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id); + __ far_call(RuntimeAddress(a)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); + return; + } + + if (_index->is_cpu_register()) { + __ mov(rscratch1, _index->as_register()); + } else { + __ mov(rscratch1, _index->as_jint()); + } + Runtime1::StubID stub_id; + if (_throw_index_out_of_bounds_exception) { + stub_id = Runtime1::throw_index_exception_id; + } else { + assert(_array != NULL, "sanity"); + __ mov(rscratch2, _array->as_pointer_register()); + stub_id = Runtime1::throw_range_check_failed_id; + } + __ far_call(RuntimeAddress(Runtime1::entry_for(stub_id)), NULL); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); +} + +PredicateFailedStub::PredicateFailedStub(CodeEmitInfo* info) { + _info = new CodeEmitInfo(info); +} + +void PredicateFailedStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + address a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id); + __ far_call(RuntimeAddress(a)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); +} + +void DivByZeroStub::emit_code(LIR_Assembler* ce) { + if (_offset != -1) { + ce->compilation()->implicit_exception_table()->append(_offset, __ offset()); + } + __ bind(_entry); + __ far_call(Address(Runtime1::entry_for(Runtime1::throw_div0_exception_id), relocInfo::runtime_call_type)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); +#ifdef ASSERT + __ should_not_reach_here(); +#endif +} + + + +// Implementation of NewInstanceStub + +NewInstanceStub::NewInstanceStub(LIR_Opr 
klass_reg, LIR_Opr result, ciInstanceKlass* klass, CodeEmitInfo* info, Runtime1::StubID stub_id) { + _result = result; + _klass = klass; + _klass_reg = klass_reg; + _info = new CodeEmitInfo(info); + assert(stub_id == Runtime1::new_instance_id || + stub_id == Runtime1::fast_new_instance_id || + stub_id == Runtime1::fast_new_instance_init_check_id, + "need new_instance id"); + _stub_id = stub_id; +} + + + +void NewInstanceStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + __ mov(r3, _klass_reg->as_register()); + __ far_call(RuntimeAddress(Runtime1::entry_for(_stub_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + assert(_result->as_register() == r0, "result must in r0,"); + __ b(_continuation); +} + + +// Implementation of NewTypeArrayStub + +// Implementation of NewTypeArrayStub + +NewTypeArrayStub::NewTypeArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) { + _klass_reg = klass_reg; + _length = length; + _result = result; + _info = new CodeEmitInfo(info); +} + + +void NewTypeArrayStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + assert(_length->as_register() == r6, "length must in r6,"); + assert(_klass_reg->as_register() == r3, "klass_reg must in r3"); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::new_type_array_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + assert(_result->as_register() == r0, "result must in r0"); + __ b(_continuation); +} + + +// Implementation of NewObjectArrayStub + +NewObjectArrayStub::NewObjectArrayStub(LIR_Opr klass_reg, LIR_Opr length, LIR_Opr result, CodeEmitInfo* info) { + _klass_reg = klass_reg; + _result = result; + _length = length; + _info = new CodeEmitInfo(info); +} + + +void NewObjectArrayStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + assert(_length->as_register() == r6, "length must in r6"); + assert(_klass_reg->as_register() == r3, "klass_reg must in r3"); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::new_object_array_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + assert(_result->as_register() == r0, "result must in r0"); + __ b(_continuation); +} +// Implementation of MonitorAccessStubs + +MonitorEnterStub::MonitorEnterStub(LIR_Opr obj_reg, LIR_Opr lock_reg, CodeEmitInfo* info) +: MonitorAccessStub(obj_reg, lock_reg) +{ + _info = new CodeEmitInfo(info); +} + + +void MonitorEnterStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + __ bind(_entry); + ce->store_parameter(_obj_reg->as_register(), 1); + ce->store_parameter(_lock_reg->as_register(), 0); + Runtime1::StubID enter_id; + if (ce->compilation()->has_fpu_code()) { + enter_id = Runtime1::monitorenter_id; + } else { + enter_id = Runtime1::monitorenter_nofpu_id; + } + __ far_call(RuntimeAddress(Runtime1::entry_for(enter_id))); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + __ b(_continuation); +} + + +void MonitorExitStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + if (_compute_lock) { + // lock_reg was destroyed by fast unlocking attempt => recompute it + ce->monitor_address(_monitor_ix, _lock_reg); + } + ce->store_parameter(_lock_reg->as_register(), 0); + // note: non-blocking leaf routine => no call info needed + Runtime1::StubID exit_id; + if (ce->compilation()->has_fpu_code()) { + exit_id = 
Runtime1::monitorexit_id; + } else { + exit_id = Runtime1::monitorexit_nofpu_id; + } + __ adr(lr, _continuation); + __ far_jump(RuntimeAddress(Runtime1::entry_for(exit_id))); +} + + +// Implementation of patching: +// - Copy the code at given offset to an inlined buffer (first the bytes, then the number of bytes) +// - Replace original code with a call to the stub +// At Runtime: +// - call to stub, jump to runtime +// - in runtime: preserve all registers (rspecially objects, i.e., source and destination object) +// - in runtime: after initializing class, restore original code, reexecute instruction + +int PatchingStub::_patch_info_offset = 0; + +void PatchingStub::align_patch_site(MacroAssembler* masm) { +} + +void PatchingStub::emit_code(LIR_Assembler* ce) { + // NativeCall::instruction_size is dynamically calculated based on CPU, + // armv7 -> 3 instructions, armv6 -> 5 instructions. Initialize _patch_info_offset + // here, when CPU is determined already. + if (!_patch_info_offset) + _patch_info_offset = -NativeCall::instruction_size; + assert(_patch_info_offset == -NativeCall::instruction_size, "must not change"); + assert(NativeCall::instruction_size <= _bytes_to_copy && _bytes_to_copy <= 0xFF, "not enough room for call"); + + Label call_patch; + + // static field accesses have special semantics while the class + // initializer is being run so we emit a test which can be used to + // check that this code is being executed by the initializing + // thread. + address being_initialized_entry = __ pc(); + if (CommentedAssembly) { + __ block_comment(" patch template"); + } + address start = __ pc(); + if (_id == load_klass_id) { + // produce a copy of the load klass instruction for use by the being initialized case + int metadata_index = -1; + CodeSection* cs = __ code_section(); + RelocIterator iter(cs, (address)_pc_start, (address)_pc_start+1); + while (iter.next()) { + if (iter.type() == relocInfo::metadata_type) { + metadata_Relocation* r = iter.metadata_reloc(); + assert(metadata_index == -1, "uninitalized yet"); + metadata_index = r->metadata_index(); + break; + } + } + assert(metadata_index != -1, "initialized"); + __ relocate(metadata_Relocation::spec(metadata_index)); + __ patchable_load(_obj, __ pc()); + while ((intx) __ pc() - (intx) start < NativeCall::instruction_size) { + __ nop(); + } +#ifdef ASSERT + for (int i = 0; i < _bytes_to_copy; i++) { + assert(*(_pc_start + i) == *(start + i), "should be the same code"); + } +#endif + } else if (_id == load_mirror_id || _id == load_appendix_id) { + // produce a copy of the load mirror instruction for use by the being + // initialized case + int oop_index = -1; + CodeSection* cs = __ code_section(); + RelocIterator iter(cs, (address)_pc_start, (address)_pc_start+1); + while (iter.next()) { + if (iter.type() == relocInfo::oop_type) { + oop_Relocation* r = iter.oop_reloc(); + assert(oop_index == -1, "uninitalized yet"); + oop_index = r->oop_index(); + break; + } + } + assert(oop_index != -1, "initialized"); + __ relocate(oop_Relocation::spec(oop_index)); + __ patchable_load(_obj, __ pc()); + while ((intx) __ pc() - (intx) start < NativeCall::instruction_size) { + __ nop(); + } +#ifdef ASSERT + for (int i = 0; i < _bytes_to_copy; i++) { + assert(*(_pc_start + i) == *(start + i), "should be the same code"); + } +#endif + } else if (_id == access_field_id) { + // make a copy the code which is going to be patched. 
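+ // Walk the relocations covering _pc_start to recover the constant-section
+ // slot (a section_word relocation) that the patched load refers to.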
+ address const_addr = (address) -1; + CodeSection* cs = __ code_section(); + RelocIterator iter(cs, (address)_pc_start, (address)_pc_start+1); + while (iter.next()) { + if (iter.type() == relocInfo::section_word_type) { + section_word_Relocation* r = iter.section_word_reloc(); + assert(const_addr == (address) -1, "uninitalized yet"); + const_addr = r->target(); + break; + } + } + assert(const_addr != (address) -1, "initialized"); + __ relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS)); + __ patchable_load(rscratch1, const_addr); + while ((intx) __ pc() - (intx) start < NativeCall::instruction_size) { + __ nop(); + } +#ifdef ASSERT + intptr_t* from = (intptr_t*) start; + intptr_t* to = (intptr_t*) _pc_start; + assert(from[0] == to[0], "should be same (nop)"); + assert(from[1] == to[1], "should be same (barrier)"); + //TODO: update + //XXX: update nativeInst_aarch32..? + #if 0 + assert(NativeFarLdr::from((address) (from + 2))->data_addr() + == NativeFarLdr::from((address) (to + 2))->data_addr(), + "should load from one addr)"); +#endif + for (int i = 4 * NativeInstruction::arm_insn_sz; i < _bytes_to_copy; i++) { + assert(*(_pc_start + i) == *(start + i), "should be the same code"); + } +#endif + } else { + ShouldNotReachHere(); + } + + int bytes_to_skip = _bytes_to_copy; + + if (_id == load_mirror_id) { + int offset = __ offset(); + if (CommentedAssembly) { + __ block_comment(" being_initialized check"); + } + assert(_obj != noreg, "must be a valid register"); + // Load without verification to keep code size small. We need it because + // begin_initialized_entry_offset has to fit in a byte. Also, we know it's not null. + __ ldr(rscratch1, Address(_obj, java_lang_Class::klass_offset_in_bytes())); + __ ldr(rscratch1, Address(rscratch1, InstanceKlass::init_thread_offset())); + __ cmp(rthread, rscratch1); + __ b(call_patch, Assembler::NE); + + // access_field patches may execute the patched code before it's + // copied back into place so we need to jump back into the main + // code of the nmethod to continue execution. + __ b(_patch_site_continuation); + // make sure this extra code gets skipped + bytes_to_skip += __ offset() - offset; + } + + // Now emit the patch record telling the runtime how to find the + // pieces of the patch. We only need 3 bytes but it has to be + // aligned as an instruction so emit 4 bytes. 
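+ // Byte layout of the record emitted below:
+ //   [0] unused (always 0)
+ //   [1] offset back to being_initialized_entry
+ //   [2] number of bytes to skip over the patch template
+ //   [3] unused (always 0)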
+ int sizeof_patch_record = 4; + bytes_to_skip += sizeof_patch_record; + + // emit the offsets needed to find the code to patch + int being_initialized_entry_offset = __ pc() - being_initialized_entry + sizeof_patch_record; + + __ emit_int8(0); + __ emit_int8(being_initialized_entry_offset); + __ emit_int8(bytes_to_skip); + __ emit_int8(0); + + address patch_info_pc = __ pc(); + + address entry = __ pc(); + NativeGeneralJump::insert_unconditional((address)_pc_start, entry); + address target = NULL; + relocInfo::relocType reloc_type = relocInfo::none; + switch (_id) { + case access_field_id: target = Runtime1::entry_for(Runtime1::access_field_patching_id); reloc_type = relocInfo::section_word_type; break; + case load_klass_id: target = Runtime1::entry_for(Runtime1::load_klass_patching_id); reloc_type = relocInfo::metadata_type; break; + case load_mirror_id: target = Runtime1::entry_for(Runtime1::load_mirror_patching_id); reloc_type = relocInfo::oop_type; break; + case load_appendix_id: target = Runtime1::entry_for(Runtime1::load_appendix_patching_id); reloc_type = relocInfo::oop_type; break; + default: ShouldNotReachHere(); + } + __ bind(call_patch); + + if (CommentedAssembly) { + __ block_comment("patch entry point"); + } + __ mov(rscratch1, RuntimeAddress(target)); + __ bl(rscratch1); + // pad with nops to globally known upper bound of patch site size + while (patch_info_pc - __ pc() < _patch_info_offset) + __ nop(); + assert(_patch_info_offset == (patch_info_pc - __ pc()), "must not change, required by shared code"); + ce->add_call_info_here(_info); + int jmp_off = __ offset(); + __ b(_patch_site_entry); + // Add enough nops so deoptimization can overwrite the jmp above with a call + // and not destroy the world. + for (int j = __ offset() ; j < jmp_off + NativeCall::instruction_size; j += NativeInstruction::arm_insn_sz) { + __ nop(); + } + + CodeSection* cs = __ code_section(); + RelocIterator iter(cs, (address)_pc_start, (address)_pc_start+1); + relocInfo::change_reloc_info_for_address(&iter, (address)_pc_start, reloc_type, relocInfo::none); +} + + +void DeoptimizeStub::emit_code(LIR_Assembler* ce) { + __ bind(_entry); + ce->store_parameter(_trap_request, 0); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::deoptimize_id))); + ce->add_call_info_here(_info); + DEBUG_ONLY(__ should_not_reach_here()); +} + + +void ImplicitNullCheckStub::emit_code(LIR_Assembler* ce) { + address a; + if (_info->deoptimize_on_exception()) { + // Deoptimize, do not throw the exception, because it is probably wrong to do it here. 
+ a = Runtime1::entry_for(Runtime1::predicate_failed_trap_id); + } else { + a = Runtime1::entry_for(Runtime1::throw_null_pointer_exception_id); + } + + ce->compilation()->implicit_exception_table()->append(_offset, __ offset()); + __ bind(_entry); + __ far_call(RuntimeAddress(a)); + ce->add_call_info_here(_info); + ce->verify_oop_map(_info); + debug_only(__ should_not_reach_here()); +} + + +void SimpleExceptionStub::emit_code(LIR_Assembler* ce) { + assert(__ rsp_offset() == 0, "frame size should be fixed"); + + __ bind(_entry); + // pass the object in a scratch register because all other registers + // must be preserved + if (_obj->is_cpu_register()) { + __ mov(rscratch1, _obj->as_register()); + } + __ far_call(RuntimeAddress(Runtime1::entry_for(_stub)), NULL); + ce->add_call_info_here(_info); + debug_only(__ should_not_reach_here()); +} + + +void ArrayCopyStub::emit_code(LIR_Assembler* ce) { + //---------------slow case: call to native----------------- + __ bind(_entry); + // Figure out where the args should go + // This should really convert the IntrinsicID to the Method* and signature + // but I don't know how to do that. + // + VMRegPair args[5]; + BasicType signature[5] = { T_OBJECT, T_INT, T_OBJECT, T_INT, T_INT}; + SharedRuntime::java_calling_convention(signature, args, 5, true); + + // push parameters + // (src, src_pos, dest, destPos, length) + Register r[5]; + r[0] = src()->as_register(); + r[1] = src_pos()->as_register(); + r[2] = dst()->as_register(); + r[3] = dst_pos()->as_register(); + r[4] = length()->as_register(); + + // next registers will get stored on the stack + for (int i = 0; i < 5 ; i++ ) { + VMReg r_1 = args[i].first(); + if (r_1->is_stack()) { + int st_off = r_1->reg2stack() * wordSize; + __ str (r[i], Address(sp, st_off)); + } else { + assert(r[i] == args[i].first()->as_Register(), "Wrong register for arg "); + } + } + + ce->align_call(lir_static_call); + + ce->emit_static_call_stub(); + Address resolve(SharedRuntime::get_resolve_static_call_stub(), + relocInfo::static_call_type); + __ trampoline_call(resolve); + ce->add_call_info_here(info()); + +#ifndef PRODUCT + __ lea(rscratch2, ExternalAddress((address)&Runtime1::_arraycopy_slowcase_cnt)); + __ increment(Address(rscratch2)); +#endif + + __ b(_continuation); +} + +#undef __ --- /dev/null 2018-09-25 19:24:17.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_Defs_aarch32.hpp 2018-09-25 19:24:17.000000000 +0300 @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_C1_DEFS_AARCH32_HPP +#define CPU_AARCH32_VM_C1_DEFS_AARCH32_HPP + +// Native word offsets from memory address (little endian format) +enum { + pd_lo_word_offset_in_bytes = 0, + pd_hi_word_offset_in_bytes = BytesPerWord +}; + +// TODO: We should understand what values are correct for the following 3 flags +// relevant to floating point operations: +// - UseSSE +// Highest supported SSE instruction set on x86/x64. I believe we should +// set it to 0 in VM_Version::initialize(), like other non-x86 ports do. +// - RoundFPResults +// Indicates whether rounding is needed for floating point results +// - pd_strict_fp_requires_explicit_rounding +// The same as above but for the strictfp mode + +// Explicit rounding operations are not required to implement the strictfp mode +enum { + pd_strict_fp_requires_explicit_rounding = false +}; + +// Registers +enum { + // Number of registers used during code emission + pd_nof_cpu_regs_frame_map = RegisterImpl::number_of_registers, + pd_nof_fpu_regs_frame_map = FloatRegisterImpl::number_of_registers, + + // Number of registers killed by calls + pd_nof_caller_save_cpu_regs_frame_map = 9, + + pd_nof_caller_save_fpu_regs_frame_map = pd_nof_fpu_regs_frame_map, + // The following two constants need to be defined since they are referenced + // from c1_FrameMap.hpp, but actually they are never used, so can be set to + // arbitrary values. + pd_nof_cpu_regs_reg_alloc = -1, + pd_nof_fpu_regs_reg_alloc = -1, + + // All the constants below are used by linear scan register allocator only. + // Number of registers visible to register allocator + pd_nof_cpu_regs_linearscan = pd_nof_cpu_regs_frame_map, + pd_nof_fpu_regs_linearscan = pd_nof_fpu_regs_frame_map, + pd_nof_xmm_regs_linearscan = 0, + + // Register allocator specific register numbers corresponding to first/last + // CPU/FPU registers available for allocation + pd_first_cpu_reg = 0, + pd_last_cpu_reg = 8, + pd_first_fpu_reg = pd_nof_cpu_regs_frame_map, + pd_last_fpu_reg = pd_first_fpu_reg + pd_nof_fpu_regs_frame_map - 1, + // Register allocator specific register numbers corresponding to first/last + // CPU/FPU callee-saved registers. These constants are used in + // LinearScan::is_caller_save() only. + pd_first_callee_saved_cpu_reg = 4, + pd_last_callee_saved_cpu_reg = 11, + pd_first_callee_saved_fpu_reg = pd_first_fpu_reg + pd_nof_fpu_regs_frame_map/2, + pd_last_callee_saved_fpu_reg = pd_first_fpu_reg + pd_nof_fpu_regs_frame_map - 1 +}; + +// This flag must be in sync with how the floating point registers are stored +// on the stack by RegisterSaver::save_live_registers() method +// (sharedRuntime_aarch32.cpp) and save_live_registers() function +// (c1_Runtime1_aarch32.cpp). On AArch32 the floating point registers keep +// floats and doubles in their native form. No float to double conversion +// happens when the registers are stored on the stack. This is opposite to +// what happens on x86, where the FPU stack registers are 80 bits wide, +// and storing them in either 4 byte or 8 byte stack slot is a conversion +// operation. 
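// Illustrative aside (not part of the original change): the enum just below
// records exactly the convention described above, namely that spilled floats
// stay 4 bytes wide. For the word offsets defined at the top of this header,
// a little-endian 32-bit VM (BytesPerWord == 4) splits a jlong across two
// 4-byte slots as in this hypothetical sketch:
#if 0
  jlong v = 0x1122334455667788LL;  // example value only
  // byte offset pd_lo_word_offset_in_bytes (0) holds the low  word 0x55667788
  // byte offset pd_hi_word_offset_in_bytes (4) holds the high word 0x11223344
#endif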
+enum { + pd_float_saved_as_double = false +}; + +#endif // CPU_AARCH32_VM_C1_DEFS_AARCH32_HPP --- /dev/null 2018-09-25 19:24:19.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_FpuStackSim_aarch32.cpp 2018-09-25 19:24:18.000000000 +0300 @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "c1/c1_FpuStackSim.hpp" + +// No FPU stack on AArch32 --- /dev/null 2018-09-25 19:24:20.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_FpuStackSim_aarch32.hpp 2018-09-25 19:24:20.000000000 +0300 @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_C1_FPUSTACKSIM_AARCH32_HPP +#define CPU_AARCH32_VM_C1_FPUSTACKSIM_AARCH32_HPP + +// No FPU stack on AArch32 + +#endif // CPU_AARCH32_VM_C1_FPUSTACKSIM_AARCH32_HPP --- /dev/null 2018-09-25 19:24:21.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_FrameMap_aarch32.cpp 2018-09-25 19:24:21.000000000 +0300 @@ -0,0 +1,257 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "c1/c1_FrameMap.hpp" +#include "c1/c1_LIR.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" + +LIR_Opr FrameMap::r0_opr; +LIR_Opr FrameMap::r1_opr; +LIR_Opr FrameMap::r2_opr; +LIR_Opr FrameMap::r3_opr; +LIR_Opr FrameMap::r4_opr; +LIR_Opr FrameMap::r5_opr; +LIR_Opr FrameMap::r6_opr; +LIR_Opr FrameMap::r7_opr; +LIR_Opr FrameMap::r8_opr; +LIR_Opr FrameMap::r9_opr; +LIR_Opr FrameMap::r10_opr; +LIR_Opr FrameMap::r11_opr; +LIR_Opr FrameMap::r12_opr; +LIR_Opr FrameMap::r13_opr; +LIR_Opr FrameMap::r14_opr; +LIR_Opr FrameMap::r15_opr; + +LIR_Opr FrameMap::r0_oop_opr; +LIR_Opr FrameMap::r1_oop_opr; +LIR_Opr FrameMap::r2_oop_opr; +LIR_Opr FrameMap::r3_oop_opr; +LIR_Opr FrameMap::r4_oop_opr; +LIR_Opr FrameMap::r5_oop_opr; +LIR_Opr FrameMap::r6_oop_opr; +LIR_Opr FrameMap::r7_oop_opr; +LIR_Opr FrameMap::r8_oop_opr; +LIR_Opr FrameMap::r9_oop_opr; +LIR_Opr FrameMap::r10_oop_opr; +LIR_Opr FrameMap::r11_oop_opr; +LIR_Opr FrameMap::r12_oop_opr; +LIR_Opr FrameMap::r13_oop_opr; +LIR_Opr FrameMap::r14_oop_opr; +LIR_Opr FrameMap::r15_oop_opr; + +LIR_Opr FrameMap::r0_metadata_opr; +LIR_Opr FrameMap::r1_metadata_opr; +LIR_Opr FrameMap::r2_metadata_opr; +LIR_Opr FrameMap::r3_metadata_opr; +LIR_Opr FrameMap::r4_metadata_opr; +LIR_Opr FrameMap::r5_metadata_opr; + +LIR_Opr FrameMap::sp_opr; +LIR_Opr FrameMap::receiver_opr; + +LIR_Opr FrameMap::rscratch1_opr; +LIR_Opr FrameMap::rscratch2_opr; +LIR_Opr FrameMap::rscratch_long_opr; + +LIR_Opr FrameMap::long0_opr; +LIR_Opr FrameMap::long1_opr; +LIR_Opr FrameMap::long2_opr; +LIR_Opr FrameMap::fpu0_float_opr; +LIR_Opr FrameMap::fpu0_double_opr; + +LIR_Opr FrameMap::_caller_save_cpu_regs[] = { 0, }; +LIR_Opr FrameMap::_caller_save_fpu_regs[] = { 0, }; + +void FrameMap::initialize() { + assert(!_init_done, "must be called once"); + + int i = 0; + map_register(i, r0); r0_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r1); r1_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r2); r2_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r3); r3_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r4); r4_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r5); r5_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r6); r6_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r7); r7_opr = LIR_OprFact::single_cpu(i); i++; + map_register(i, r8); r8_opr = LIR_OprFact::single_cpu(i); i++; + // Mapping lines in this block may be arbitrarily mixed, but all allocatable + // registers should go above this comment, and unallocatable registers - + // below. 
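// Illustrative aside (not part of the original change): the registers mapped
// above this comment (r0..r8) receive allocator numbers 0..8, which matches
// pd_first_cpu_reg/pd_last_cpu_reg and the nine caller-save entries declared
// in c1_Defs_aarch32.hpp; the registers mapped below (r9..r15) only get
// frame-map numbers, e.g. so that as_pointer_opr(sp) still works, but they
// are reserved for rscratch1, rthread, rfp, rscratch2, sp, lr and pc and are
// never made available to the linear scan allocator.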
+ map_register(i, r9); r9_opr = LIR_OprFact::single_cpu(i); i++; // rscratch1 + map_register(i, r10); r10_opr = LIR_OprFact::single_cpu(i); i++; // rthread + map_register(i, r11); r11_opr = LIR_OprFact::single_cpu(i); i++; // rfp + map_register(i, r12); r12_opr = LIR_OprFact::single_cpu(i); i++; // rscratch2 + map_register(i, r13); r13_opr = LIR_OprFact::single_cpu(i); i++; // sp + map_register(i, r14); r14_opr = LIR_OprFact::single_cpu(i); i++; // lr + map_register(i, r15); r15_opr = LIR_OprFact::single_cpu(i); i++; // r15_pc + + // This flag must be set after all integer registers are mapped but before + // the first use of as_*_opr() methods. + _init_done = true; + + r0_oop_opr = as_oop_opr(r0); + r1_oop_opr = as_oop_opr(r1); + r2_oop_opr = as_oop_opr(r2); + r3_oop_opr = as_oop_opr(r3); + r4_oop_opr = as_oop_opr(r4); + r5_oop_opr = as_oop_opr(r5); + r6_oop_opr = as_oop_opr(r6); + r7_oop_opr = as_oop_opr(r7); + r8_oop_opr = as_oop_opr(r8); + r9_oop_opr = as_oop_opr(r9); + r10_oop_opr = as_oop_opr(r10); + r11_oop_opr = as_oop_opr(r11); + r12_oop_opr = as_oop_opr(r12); + r13_oop_opr = as_oop_opr(r13); + r14_oop_opr = as_oop_opr(r14); + r15_oop_opr = as_oop_opr(r15); + + r0_metadata_opr = as_metadata_opr(r0); + r1_metadata_opr = as_metadata_opr(r1); + r2_metadata_opr = as_metadata_opr(r2); + r3_metadata_opr = as_metadata_opr(r3); + r4_metadata_opr = as_metadata_opr(r4); + r5_metadata_opr = as_metadata_opr(r5); + + sp_opr = as_pointer_opr(sp); + + VMRegPair regs; + BasicType sig_bt = T_OBJECT; + SharedRuntime::java_calling_convention(&sig_bt, ®s, 1, true); + receiver_opr = as_oop_opr(regs.first()->as_Register()); + + rscratch1_opr = as_opr(rscratch1); + rscratch2_opr = as_opr(rscratch2); + rscratch_long_opr = as_long_opr(rscratch1, rscratch2); + + long0_opr = as_long_opr(r0, r1); + long1_opr = as_long_opr(r2, r3); + long2_opr = as_long_opr(r4, r5); + fpu0_float_opr = LIR_OprFact::single_fpu(0); + fpu0_double_opr = LIR_OprFact::double_fpu(0, 1); + + _caller_save_cpu_regs[0] = r0_opr; + _caller_save_cpu_regs[1] = r1_opr; + _caller_save_cpu_regs[2] = r2_opr; + _caller_save_cpu_regs[3] = r3_opr; + _caller_save_cpu_regs[4] = r4_opr; + _caller_save_cpu_regs[5] = r5_opr; + _caller_save_cpu_regs[6] = r6_opr; + _caller_save_cpu_regs[7] = r7_opr; + _caller_save_cpu_regs[8] = r8_opr; + + for (i = 0; i < nof_caller_save_fpu_regs; i++) { + _caller_save_fpu_regs[i] = LIR_OprFact::single_fpu(i); + } +} + +LIR_Opr FrameMap::stack_pointer() { + return sp_opr; +} + +// TODO: Make sure that neither method handle intrinsics nor compiled lambda +// forms modify sp register (i.e., vmIntrinsics::{_invokeBasic, _linkToVirtual, +// _linkToStatic, _linkToSpecial, _linkToInterface, _compiledLambdaForm}) +LIR_Opr FrameMap::method_handle_invoke_SP_save_opr() { + return LIR_OprFact::illegalOpr; +} + +// Return LIR_Opr corresponding to the given VMRegPair and data type +LIR_Opr FrameMap::map_to_opr(BasicType type, VMRegPair* reg, bool) { + LIR_Opr opr = LIR_OprFact::illegalOpr; + VMReg r_1 = reg->first(); + VMReg r_2 = reg->second(); + if (r_1->is_stack()) { + // Convert stack slot to sp-based address. The calling convention does not + // count the SharedRuntime::out_preserve_stack_slots() value, so we must + // add it in here. 
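// Illustrative aside (not part of the original change), assuming for the sake
// of the example that out_preserve_stack_slots() == 0 and
// VMRegImpl::stack_slot_size == 4: an incoming argument assigned to stack
// slot 3 yields st_off = (3 + 0) * 4 = 12 below, i.e. an sp-based LIR_Address
// with displacement 12.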
+ int st_off = + (r_1->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * + VMRegImpl::stack_slot_size; + opr = LIR_OprFact::address(new LIR_Address(sp_opr, st_off, type)); + } else if (r_1->is_Register()) { + Register reg1 = r_1->as_Register(); +#ifdef HARD_FLOAT_CC + if (type == T_DOUBLE || type == T_FLOAT) { + ShouldNotReachHere(); + } else +#endif + if (type == T_LONG || type == T_DOUBLE) { + assert(r_2->is_Register(), "wrong VMReg"); + Register reg2 = r_2->as_Register(); + opr = as_long_opr(reg1, reg2); + } else if (type == T_OBJECT || type == T_ARRAY) { + opr = as_oop_opr(reg1); + } else if (type == T_METADATA) { + opr = as_metadata_opr(reg1); + } else { + opr = as_opr(reg1); + } + } else if (r_1->is_FloatRegister()) { + int num = r_1->as_FloatRegister()->encoding(); + if (type == T_FLOAT) { + opr = LIR_OprFact::single_fpu(num); + } else { + assert(is_even(num) && r_2->as_FloatRegister()->encoding() == (num + 1), + "wrong VMReg"); + opr = LIR_OprFact::double_fpu(num, num + 1); + } + } else { + ShouldNotReachHere(); + } + return opr; +} + +// Return VMReg corresponding to the given FPU register number as it is +// encoded in LIR_Opr. The conversion is straightforward because in this +// implementation the encoding of FPU registers in LIR_Opr's is the same as +// in FloatRegister's. +VMReg FrameMap::fpu_regname(int n) { + return as_FloatRegister(n)->as_VMReg(); +} + +// Check that the frame is properly addressable on the platform. The sp-based +// address of every frame slot must have the offset expressible as AArch32's +// imm12 with the separately stored sign. +bool FrameMap::validate_frame() { + int max_offset = in_bytes(framesize_in_bytes()); + int java_index = 0; + for (int i = 0; i < _incoming_arguments->length(); i++) { + LIR_Opr opr = _incoming_arguments->at(i); + if (opr->is_stack()) { + max_offset = MAX2(_argument_locations->at(java_index), max_offset); + } + java_index += type2size[opr->type()]; + } + return Assembler::is_valid_for_offset_imm(max_offset, 12); +} + +Address FrameMap::make_new_address(ByteSize sp_offset) const { + return Address(sp, in_bytes(sp_offset)); +} --- /dev/null 2018-09-25 19:24:22.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_FrameMap_aarch32.hpp 2018-09-25 19:24:22.000000000 +0300 @@ -0,0 +1,169 @@ +/* + * Copyright (c) 1999, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_C1_FRAMEMAP_AARCH32_HPP +#define CPU_AARCH32_VM_C1_FRAMEMAP_AARCH32_HPP + +// The following schema visualizes how a C1 frame looks like on AArch32. +// It corresponds to the case of an unextended frame. Each line of text +// represents one 4-byte slot. Every monitor takes two slots. Positions of +// incoming arguments are determined by the Java calling convention. Spill +// area and monitor area are not required to be 8-byte aligned. The slot +// for deoptimization support is used by frame::deoptimize() method to save +// the original pc before patching in the new one. +// +// When LIR_Opr's reference stack slots, they use virtual stack slot indices. +// They are mapped to the real stack slots by FrameMap::sp_offset_for_slot() +// and FrameMap::sp_offset_for_double_slot() methods. The first _argcount +// virtual stack slots correspond to the real stack slots occupied by the +// incoming arguments. Their mapping is defined by _argument_locations array +// (which is filled in by applying the Java calling convention). All other +// virtual stack slots correspond to spill slots. +// +// Higher addresses +// | incoming | virtual stack slots +// | | [0 ... _arg_count - 1] +// | arguments | +// |====================================|----X- 8-byte aligned +// | previous lr | /|\ address +// rfp ===> |------------------------------------| | +// | previous rfp | | +// |====================================| | +// | alignment slot (if needed) | | +// |====================================| | +// | slot for deoptimization support | | +// |====================================| | +// | monitor [_num_monitors - 1] object | | +// | | | +// | monitor [_num_monitors - 1] lock | | +// |------------------------------------| | +// | | | +// Direction of | ... | | _framesize +// stack growth | | | slots +// | |------------------------------------| | +// V | monitor [0] object | | +// | | | +// | monitor [0] lock | | +// |====================================| | +// | spill slot [_num_spills - 1] | | virtual stack slot +// |------------------------------------| | [_arg_count + _num_spills - 1] +// | ... | | ... +// |------------------------------------| | +// | spill slot [0] | | virtual stack slot +// |====================================| | [_arg_count] +// | reserved argument area for | | +// | ... 
| | +// | outgoing calls (8-byte aligned) | \|/ +// sp ===> |====================================|----X- 8-byte aligned +// | | address +// Lower addresses + + public: + enum { + first_available_sp_in_frame = 0, + max_frame_pad = 16, // max value that frame::get_frame_size() may return + frame_pad_in_bytes = max_frame_pad + }; + + public: + static LIR_Opr r0_opr; + static LIR_Opr r1_opr; + static LIR_Opr r2_opr; + static LIR_Opr r3_opr; + static LIR_Opr r4_opr; + static LIR_Opr r5_opr; + static LIR_Opr r6_opr; + static LIR_Opr r7_opr; + static LIR_Opr r8_opr; + static LIR_Opr r9_opr; + static LIR_Opr r10_opr; + static LIR_Opr r11_opr; + static LIR_Opr r12_opr; + static LIR_Opr r13_opr; + static LIR_Opr r14_opr; + static LIR_Opr r15_opr; + + static LIR_Opr r0_oop_opr; + static LIR_Opr r1_oop_opr; + static LIR_Opr r2_oop_opr; + static LIR_Opr r3_oop_opr; + static LIR_Opr r4_oop_opr; + static LIR_Opr r5_oop_opr; + static LIR_Opr r6_oop_opr; + static LIR_Opr r7_oop_opr; + static LIR_Opr r8_oop_opr; + static LIR_Opr r9_oop_opr; + static LIR_Opr r10_oop_opr; + static LIR_Opr r11_oop_opr; + static LIR_Opr r12_oop_opr; + static LIR_Opr r13_oop_opr; + static LIR_Opr r14_oop_opr; + static LIR_Opr r15_oop_opr; + + static LIR_Opr r0_metadata_opr; + static LIR_Opr r1_metadata_opr; + static LIR_Opr r2_metadata_opr; + static LIR_Opr r3_metadata_opr; + static LIR_Opr r4_metadata_opr; + static LIR_Opr r5_metadata_opr; + + static LIR_Opr sp_opr; + static LIR_Opr receiver_opr; + + static LIR_Opr rscratch1_opr; + static LIR_Opr rscratch2_opr; + static LIR_Opr rscratch_long_opr; + + static LIR_Opr long0_opr; + static LIR_Opr long1_opr; + static LIR_Opr long2_opr; + static LIR_Opr fpu0_float_opr; + static LIR_Opr fpu0_double_opr; + + static LIR_Opr as_long_opr(Register r1, Register r2) { + return LIR_OprFact::double_cpu(cpu_reg2rnr(r1), cpu_reg2rnr(r2)); + } + static LIR_Opr as_pointer_opr(Register r) { + return LIR_OprFact::single_cpu(cpu_reg2rnr(r)); + } + + static VMReg fpu_regname(int n); + + static bool is_caller_save_register(LIR_Opr opr) { + // On AArch32, unlike on SPARC, we never explicitly request the C1 register + // allocator to allocate a callee-saved register. Since the only place this + // method is called is the assert in LinearScan::color_lir_opr(), we can + // safely just always return true here. + return true; + } + static int nof_caller_save_cpu_regs() { + return pd_nof_caller_save_cpu_regs_frame_map; + } + static int last_cpu_reg() { + return pd_last_cpu_reg; + } + +#endif // CPU_AARCH32_VM_C1_FRAMEMAP_AARCH32_HPP --- /dev/null 2018-09-25 19:24:23.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_LIRAssembler_aarch32.cpp 2018-09-25 19:24:23.000000000 +0300 @@ -0,0 +1,3280 @@ +/* + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "asm/assembler.hpp" +#include "c1/c1_CodeStubs.hpp" +#include "c1/c1_Compilation.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "c1/c1_ValueStack.hpp" +#include "ci/ciArrayKlass.hpp" +#include "ci/ciInstance.hpp" +#include "gc/shared/barrierSet.hpp" +#include "gc/shared/cardTableBarrierSet.hpp" +#include "gc/shared/collectedHeap.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/objArrayKlass.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" + +#include "register_aarch32.hpp" + +#ifndef PRODUCT +#define COMMENT(x) do { __ block_comment(x); } while (0) +#else +#define COMMENT(x) +#endif + +NEEDS_CLEANUP // remove this definitions ? +const Register IC_Klass = rscratch2; // where the IC klass is cached +const Register SYNC_header = r0; // synchronization header +const Register SHIFT_count = r0; // where count for shift operations must be + +#define __ _masm-> + + +static void select_different_registers(Register preserve, + Register extra, + Register &tmp1, + Register &tmp2) { + if (tmp1 == preserve) { + assert_different_registers(tmp1, tmp2, extra); + tmp1 = extra; + } else if (tmp2 == preserve) { + assert_different_registers(tmp1, tmp2, extra); + tmp2 = extra; + } + assert_different_registers(preserve, tmp1, tmp2); +} + + + +static void select_different_registers(Register preserve, + Register extra, + Register &tmp1, + Register &tmp2, + Register &tmp3) { + if (tmp1 == preserve) { + assert_different_registers(tmp1, tmp2, tmp3, extra); + tmp1 = extra; + } else if (tmp2 == preserve) { + assert_different_registers(tmp1, tmp2, tmp3, extra); + tmp2 = extra; + } else if (tmp3 == preserve) { + assert_different_registers(tmp1, tmp2, tmp3, extra); + tmp3 = extra; + } + assert_different_registers(preserve, tmp1, tmp2, tmp3); +} + +bool LIR_Assembler::is_small_constant(LIR_Opr opr) { Unimplemented(); return false; } + + +LIR_Opr LIR_Assembler::receiverOpr() { + return FrameMap::receiver_opr; +} + +LIR_Opr LIR_Assembler::osrBufferPointer() { + return FrameMap::as_pointer_opr(receiverOpr()->as_register()); +} + +//--------------fpu register translations----------------------- + + +address LIR_Assembler::float_constant(float f) { + address const_addr = __ float_constant(f); + if (const_addr == NULL) { + bailout("const section overflow"); + return __ code()->consts()->start(); + } else { + return const_addr; + } +} + + +address LIR_Assembler::double_constant(double d) { + address const_addr = __ double_constant(d); + if (const_addr == NULL) { + bailout("const section overflow"); + return __ code()->consts()->start(); + } else { + return const_addr; + } +} + +void LIR_Assembler::set_24bit_FPU() { Unimplemented(); } + +void LIR_Assembler::reset_FPU() { Unimplemented(); } + +void LIR_Assembler::fpop() { Unimplemented(); } + +void LIR_Assembler::fxch(int i) { Unimplemented(); } + +void LIR_Assembler::fld(int i) { Unimplemented(); } + +void LIR_Assembler::ffree(int i) { Unimplemented(); } + +void 
LIR_Assembler::breakpoint() { __ bkpt(0); } + +void LIR_Assembler::push(LIR_Opr opr) { Unimplemented(); } + +void LIR_Assembler::pop(LIR_Opr opr) { Unimplemented(); } + +//------------------------------------------- + +static Register as_reg(LIR_Opr op) { + return op->is_double_cpu() ? op->as_register_lo() : op->as_register(); +} + +Address LIR_Assembler::as_Address(LIR_Address* addr) { + // as_Address(LIR_Address*, Address::InsnDataType) should be used instead + ShouldNotCallThis(); + return Address(); +} + +Address LIR_Assembler::as_Address_hi(LIR_Address* addr) { + // as_Address_hi(LIR_Address*, Address::InsnDataType) should be used instead + ShouldNotCallThis(); + return Address(); +} + +Address LIR_Assembler::as_Address_lo(LIR_Address* addr) { + // as_Address_lo(LIR_Address*, Address::InsnDataType) should be used instead + ShouldNotCallThis(); + return Address(); +} + +Address LIR_Assembler::as_Address(LIR_Address* addr, Register tmp, Address::InsnDataType type) { + if (addr->base()->is_illegal()) { + assert(addr->index()->is_illegal(), "must be illegal too"); + __ mov(tmp, addr->disp()); + return Address(tmp); // encoding is ok for any data type + } + + Register base = addr->base()->as_pointer_register(); + + if (addr->index()->is_illegal()) { + return Address(base, addr->disp()).safe_for(type, _masm, tmp); + } else if (addr->index()->is_cpu_register()) { + assert(addr->disp() == 0, "must be"); + Register index = addr->index()->as_pointer_register(); + return Address(base, index, lsl(addr->scale())).safe_for(type, _masm, tmp); + } else if (addr->index()->is_constant()) { + intptr_t addr_offset = (addr->index()->as_constant_ptr()->as_jint() << addr->scale()) + addr->disp(); + return Address(base, addr_offset).safe_for(type, _masm, tmp); + } + + Unimplemented(); + return Address(); +} + +Address LIR_Assembler::as_Address_hi(LIR_Address* addr, Address::InsnDataType type) { + assert(type == Address::IDT_INT, "only to be used for accessing high word of jlong"); + + if (addr->base()->is_illegal()) { + assert(addr->index()->is_illegal(), "must be illegal too"); + __ mov(rscratch1, addr->disp() + wordSize); + return Address(rscratch1); // encoding is ok for IDR_INT + } + + Register base = addr->base()->as_pointer_register(); + + if (addr->index()->is_illegal()) { + return Address(base, addr->disp() + wordSize).safe_for(Address::IDT_INT, _masm, rscratch1); + } else if (addr->index()->is_cpu_register()) { + assert(addr->disp() == 0, "must be"); + Register index = addr->index()->as_pointer_register(); + __ add(rscratch1, base, wordSize); + return Address(rscratch1, index, lsl(addr->scale())); // encoding is ok for IDT_INT + } else if (addr->index()->is_constant()) { + intptr_t addr_offset = (addr->index()->as_constant_ptr()->as_jint() << addr->scale()) + addr->disp() + wordSize; + return Address(base, addr_offset).safe_for(Address::IDT_INT, _masm, rscratch1); + } + + Unimplemented(); + return Address(); +} + +Address LIR_Assembler::as_Address_lo(LIR_Address* addr, Address::InsnDataType type) { + return as_Address(addr, rscratch1, type); +} + + +void LIR_Assembler::osr_entry() { + offsets()->set_value(CodeOffsets::OSR_Entry, code_offset()); + BlockBegin* osr_entry = compilation()->hir()->osr_entry(); + ValueStack* entry_state = osr_entry->state(); + int number_of_locks = entry_state->locks_size(); + + // we jump here if osr happens with the interpreter + // state set up to continue at the beginning of the + // loop that triggered osr - in particular, we have + // the following registers setup: + 
// + // r1: osr buffer + // + + // build frame + ciMethod* m = compilation()->method(); + __ build_frame(initial_frame_size_in_bytes(), bang_size_in_bytes()); + + // OSR buffer is + // + // locals[nlocals-1..0] + // monitors[0..number_of_locks] + // + // locals is a direct copy of the interpreter frame so in the osr buffer + // so first slot in the local array is the last local from the interpreter + // and last slot is local[0] (receiver) from the interpreter + // + // Similarly with locks. The first lock slot in the osr buffer is the nth lock + // from the interpreter frame, the nth lock slot in the osr buffer is 0th lock + // in the interpreter frame (the method lock if a sync method) + + // Initialize monitors in the compiled activation. + // r1: pointer to osr buffer + // + // All other registers are dead at this point and the locals will be + // copied into place by code emitted in the IR. + + Register OSR_buf = osrBufferPointer()->as_pointer_register(); + { assert(frame::interpreter_frame_monitor_size() == BasicObjectLock::size(), "adjust code below"); + int monitor_offset = BytesPerWord * method()->max_locals() + + (2 * BytesPerWord) * (number_of_locks - 1); + // SharedRuntime::OSR_migration_begin() packs BasicObjectLocks in + // the OSR buffer using 2 word entries: first the lock and then + // the oop. + for (int i = 0; i < number_of_locks; i++) { + int slot_offset = monitor_offset - ((i * 2) * BytesPerWord); +#ifdef ASSERT + // verify the interpreter's monitor has a non-null object + { + Label L; + __ ldr(rscratch1, Address(OSR_buf, slot_offset + 1*BytesPerWord)); + __ cbnz(rscratch1, L); + __ stop("locked object is NULL"); + __ bind(L); + } +#endif + __ ldr(rscratch1, Address(OSR_buf, slot_offset + 0)); + __ str(rscratch1, frame_map()->address_for_monitor_lock(i)); + __ ldr(rscratch1, Address(OSR_buf, slot_offset + 1*BytesPerWord)); + __ str(rscratch1, frame_map()->address_for_monitor_object(i)); + } + } +} + + +// inline cache check; done before the frame is built. +int LIR_Assembler::check_icache() { + Register receiver = FrameMap::receiver_opr->as_register(); + Register ic_klass = IC_Klass; + int start_offset = __ offset(); + __ inline_cache_check(receiver, ic_klass); + + // if icache check fails, then jump to runtime routine + // Note: RECEIVER must still contain the receiver! + Label dont; + __ b(dont, Assembler::EQ); + __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); + + // We align the verified entry point unless the method body + // (including its inline cache check) will fit in a single 64-byte + // icache line. + if (! method()->is_accessor() || __ offset() - start_offset > 4 * 4) { + // force alignment after the cache check. 
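// Illustrative aside (not part of the original change): alignment is skipped
// only when the method is a trivial accessor and the inline cache check above
// emitted at most 4 * 4 = 16 bytes (four 4-byte instructions), i.e. when the
// whole body plausibly fits in one 64-byte icache line; in every other case
// the verified entry point is aligned by the call below.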
+ __ align(CodeEntryAlignment); + } + + __ bind(dont); + return start_offset; +} + + +void LIR_Assembler::jobject2reg(jobject o, Register reg) { + if (o == NULL) { + __ mov(reg, 0); + } else { + __ movoop(reg, o, /*immediate*/true); + } +} + +void LIR_Assembler::deoptimize_trap(CodeEmitInfo *info) { + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::deoptimize_id))); + add_call_info_here(info); +} + +void LIR_Assembler::jobject2reg_with_patching(Register reg, CodeEmitInfo *info) { + PatchingStub* patch = new PatchingStub(_masm, patching_id(info)); + __ relocate(oop_Relocation::spec(__ oop_recorder()->allocate_oop_index(NULL))); + __ patchable_load(reg, pc()); + patching_epilog(patch, lir_patch_normal, reg, info); +} + +// Return sp decrement needed to build a frame +int LIR_Assembler::initial_frame_size_in_bytes() const { + // We need to subtract size of saved backtrace structure in counting frame size + return in_bytes(frame_map()->framesize_in_bytes()) - frame::get_frame_size() * wordSize; +} + +int LIR_Assembler::emit_exception_handler() { + // if the last instruction is a call (typically to do a throw which + // is coming at the end after block reordering) the return address + // must still point into the code area in order to avoid assertion + // failures when searching for the corresponding bci => add a nop + // (was bug 5/14/1999 - gri) + __ nop(); + + // generate code for exception handler + address handler_base = __ start_a_stub(exception_handler_size()); + if (handler_base == NULL) { + // not enough space left for the handler + bailout("exception handler overflow"); + return -1; + } + + int offset = code_offset(); + + // the exception oop and pc are in r0, and r3 + // no other registers need to be preserved, so invalidate them + __ invalidate_registers(false, true, false); + + // check that there is really an exception + __ verify_not_null_oop(r0); + + // search an exception handler (r0: exception oop, r3: throwing pc) + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::handle_exception_from_callee_id))); __ should_not_reach_here(); + guarantee(code_offset() - offset <= exception_handler_size(), "overflow"); + __ end_a_stub(); + + return offset; +} + + +// Emit the code to remove the frame from the stack in the exception +// unwind path. +int LIR_Assembler::emit_unwind_handler() { +#ifndef PRODUCT + if (CommentedAssembly) { + _masm->block_comment("Unwind handler"); + } +#endif + + int offset = code_offset(); + + // Fetch the exception from TLS and clear out exception related thread state + __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ str(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + + __ bind(_unwind_handler_entry); + __ verify_not_null_oop(r0); + + // Preform needed unlocking + MonitorExitStub* stub = NULL; + if (method()->is_synchronized()) { + monitor_address(0, FrameMap::r1_opr); + stub = new MonitorExitStub(FrameMap::r1_opr, true, 0); + __ unlock_object(r5, r4, r1, *stub->entry()); + __ bind(*stub->continuation()); + } + + if (compilation()->env()->dtrace_method_probes()) { + __ call_Unimplemented(); +#if 0 + // FIXME check exception_store is not clobbered below! 
+ __ movptr(Address(rsp, 0), rax); + __ mov_metadata(Address(rsp, sizeof(void*)), method()->constant_encoding()); + __ call(RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit))); +#endif + } + + // remove the activation and dispatch to the unwind handler + __ block_comment("remove_frame and dispatch to the unwind handler"); + __ remove_frame(initial_frame_size_in_bytes()); + __ far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::unwind_exception_id))); + + // Emit the slow path assembly + if (stub != NULL) { + stub->emit_code(this); + } + + return offset; +} + + +int LIR_Assembler::emit_deopt_handler() { + // if the last instruction is a call (typically to do a throw which + // is coming at the end after block reordering) the return address + // must still point into the code area in order to avoid assertion + // failures when searching for the corresponding bci => add a nop + // (was bug 5/14/1999 - gri) + __ nop(); + + // generate code for exception handler + address handler_base = __ start_a_stub(deopt_handler_size()); + if (handler_base == NULL) { + // not enough space left for the handler + bailout("deopt handler overflow"); + return -1; + } + + int offset = code_offset(); + + __ adr(lr, pc()); + // deopt handler expects deopt pc already pushed to stack, since for C2 + // it's not possible to allocate any register to hold the value + __ push(RegSet::of(lr), sp); + __ far_jump(RuntimeAddress(SharedRuntime::deopt_blob()->unpack())); + guarantee(code_offset() - offset <= deopt_handler_size(), "overflow"); + __ end_a_stub(); + + return offset; +} + +void LIR_Assembler::add_debug_info_for_branch(address adr, CodeEmitInfo* info) { + _masm->code_section()->relocate(adr, relocInfo::poll_type); + int pc_offset = code_offset(); + flush_debug_info(pc_offset); + info->record_debug_info(compilation()->debug_info_recorder(), pc_offset); + if (info->exception_handlers() != NULL) { + compilation()->add_exception_handlers_for_pco(pc_offset, info->exception_handlers()); + } +} + +void LIR_Assembler::return_op(LIR_Opr result) { + assert(result->is_illegal() || !result->is_single_cpu() || result->as_register() == r0, "word returns are in r0,"); + // Pop the stack before the safepoint code + __ remove_frame(initial_frame_size_in_bytes()); + + if (StackReservedPages > 0 && compilation()->has_reserved_stack_access()) { + __ reserved_stack_check(); + } + + address polling_page(os::get_polling_page()); + __ read_polling_page(rscratch2, polling_page, relocInfo::poll_return_type); + __ ret(lr); +} + +int LIR_Assembler::safepoint_poll(LIR_Opr tmp, CodeEmitInfo* info) { + address polling_page(os::get_polling_page()); + guarantee(info != NULL, "Shouldn't be NULL"); + assert(os::is_poll_address(polling_page), "should be"); + __ get_polling_page(rscratch2, polling_page, relocInfo::poll_type); + add_debug_info_for_branch(info); // This isn't just debug info: + // it's the oop map + __ read_polling_page(rscratch2, relocInfo::poll_type); + return __ offset(); +} + +void LIR_Assembler::move_regs(Register from_reg, Register to_reg) { + if (from_reg != to_reg) { + __ mov(to_reg, from_reg); + } +} + +void LIR_Assembler::swap_reg(Register a, Register b) { + Unimplemented(); +} + +void LIR_Assembler::const2reg(LIR_Opr src, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info) { + assert(src->is_constant(), "should not call otherwise"); + assert(dest->is_register(), "should not call otherwise"); + LIR_Const* c = src->as_constant_ptr(); + + switch (c->type()) { + case T_INT: { + 
assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register(), c->as_jint_bits()); + break; + } + + case T_ADDRESS: { + assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register(), c->as_jint()); + break; + } + + case T_LONG: { + assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register_lo(), c->as_jint_lo_bits()); + __ mov(dest->as_register_hi(), c->as_jint_hi_bits()); + break; + } + + case T_OBJECT: { + if (patch_code == lir_patch_none) { + jobject2reg(c->as_jobject(), dest->as_register()); + } else { + jobject2reg_with_patching(dest->as_register(), info); + } + break; + } + + case T_METADATA: { + if (patch_code != lir_patch_none) { + klass2reg_with_patching(dest->as_register(), info); + } else { + __ mov_metadata(dest->as_register(), c->as_metadata()); + } + break; + } + + case T_FLOAT: { + if(dest->is_single_fpu()) { + if (__ operand_valid_for_float_immediate(c->as_jfloat())) { + __ vmov_f32(dest->as_float_reg(), c->as_jfloat()); + } else { + __ lea(rscratch1, InternalAddress(float_constant(c->as_jfloat()))); + __ vldr_f32(dest->as_float_reg(), Address(rscratch1)); + } + } else { + assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register(), c->as_jint_bits()); + } + break; + } + + case T_DOUBLE: { + if(dest->is_double_fpu()) { + if (__ operand_valid_for_double_immediate(c->as_jdouble())) { + __ vmov_f64(dest->as_double_reg(), c->as_jdouble()); + } else { + __ lea(rscratch1, InternalAddress(double_constant(c->as_jdouble()))); + __ vldr_f64(dest->as_double_reg(), Address(rscratch1)); + } + } else { + assert(patch_code == lir_patch_none, "no patching handled here"); + __ mov(dest->as_register_lo(), c->as_jint_lo_bits()); + __ mov(dest->as_register_hi(), c->as_jint_hi_bits()); + } + break; + } + + default: + ShouldNotReachHere(); + } +} + +void LIR_Assembler::const2stack(LIR_Opr src, LIR_Opr dest) { + LIR_Const* c = src->as_constant_ptr(); + switch (c->type()) { + case T_OBJECT: + { + if (! c->as_jobject()) { + __ mov(rscratch1, 0); + __ str(rscratch1, frame_map()->address_for_slot(dest->single_stack_ix())); + } else { + const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL); + reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false); + } + } + break; + case T_ADDRESS: + { + const2reg(src, FrameMap::rscratch1_opr, lir_patch_none, NULL); + reg2stack(FrameMap::rscratch1_opr, dest, c->type(), false); + } + case T_INT: + case T_FLOAT: + { + __ mov(rscratch1, c->as_jint_bits()); + __ str(rscratch1, frame_map()->address_for_slot(dest->single_stack_ix())); + } + break; + case T_LONG: + case T_DOUBLE: + { + __ mov(rscratch1, c->as_jint_lo()); + __ str(rscratch1, frame_map()->address_for_slot(dest->double_stack_ix(), + lo_word_offset_in_bytes)); + if (c->as_jint_lo() != c->as_jint_hi()) + __ mov(rscratch1, c->as_jint_hi()); + __ str(rscratch1, frame_map()->address_for_slot(dest->double_stack_ix(), + hi_word_offset_in_bytes)); + } + break; + default: + ShouldNotReachHere(); + } +} + +/* + * For now this code can load only zero constants as in aarch32. + * It seems like this implementation can break some tests in future. + * TODO: ensure, write test, and rewrite if need. 
+ */ +void LIR_Assembler::const2mem(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info, bool wide) { + assert(src->is_constant(), "should not call otherwise"); + LIR_Const* c = src->as_constant_ptr(); + LIR_Address* to_addr = dest->as_address_ptr(); + + void (Assembler::* insn)(Register Rt, const Address &adr, Assembler::Condition cnd); + + __ mov(rscratch2, 0); + + int null_check_here = code_offset(); + + Address::InsnDataType idt = Address::toInsnDataType(type); + switch (type) { + case T_ADDRESS: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::str; + break; + case T_LONG: { + assert(c->as_jlong() == 0, "should be"); + insn = &Assembler::str; + Address addr = as_Address_hi(to_addr, Address::IDT_INT); + null_check_here = code_offset(); + __ str(rscratch2, addr); + idt = Address::IDT_INT; + break; + } + case T_INT: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::str; + break; + case T_OBJECT: + case T_ARRAY: + assert(c->as_jobject() == 0, "should be"); + insn = &Assembler::str; + break; + case T_CHAR: + case T_SHORT: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::strh; + break; + case T_BOOLEAN: + case T_BYTE: + assert(c->as_jint() == 0, "should be"); + insn = &Assembler::strb; + break; + default: + ShouldNotReachHere(); + insn = &Assembler::str; // unreachable + } + + (_masm->*insn)(rscratch2, as_Address(to_addr, idt), Assembler::C_DFLT); + if (info) add_debug_info_for_null_check(null_check_here, info); +} + +void LIR_Assembler::reg2reg(LIR_Opr src, LIR_Opr dest) { + assert(src->is_register(), "should not call otherwise"); + assert(dest->is_register(), "should not call otherwise"); + + // move between cpu-registers + if (dest->is_single_cpu()) { + if (src->type() == T_LONG) { + // Can do LONG -> OBJECT + __ stop("investigate how \"LONG -> OBJECT\" works especially when high part is != 0"); + move_regs(src->as_register_lo(), dest->as_register()); + return; + } + if(src->is_single_fpu()) { + __ vmov_f32(dest->as_register(), src->as_float_reg()); + } else { + assert(src->is_single_cpu(), "must match"); + if (src->type() == T_OBJECT) { + __ verify_oop(src->as_register()); + } + move_regs(src->as_register(), dest->as_register()); + } + } else if (dest->is_double_cpu()) { + if(src->is_double_fpu()) { + __ vmov_f64(dest->as_register_lo(), dest->as_register_hi(), src->as_double_reg()); + } else { + assert(src->is_double_cpu(), "must match"); + Register f_lo = src->as_register_lo(); + Register f_hi = src->as_register_hi(); + Register t_lo = dest->as_register_lo(); + Register t_hi = dest->as_register_hi(); + assert(f_hi != f_lo, "must be different"); + assert(t_hi != t_lo, "must be different"); + check_register_collision(t_lo, &f_hi); + move_regs(f_lo, t_lo); + move_regs(f_hi, t_hi); + } + } else if (dest->is_single_fpu()) { + if(src->is_single_cpu()) { + __ vmov_f32(dest->as_float_reg(), src->as_register()); + } else { + __ vmov_f32(dest->as_float_reg(), src->as_float_reg()); + } + } else if (dest->is_double_fpu()) { + if(src->is_double_cpu()) { + __ vmov_f64(dest->as_double_reg(), src->as_register_lo(), src->as_register_hi()); + } else { + __ vmov_f64(dest->as_double_reg(), src->as_double_reg()); + } + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::reg2stack(LIR_Opr src, LIR_Opr dest, BasicType type, bool pop_fpu_stack) { + if (src->is_single_cpu()) { + if (type == T_ARRAY || type == T_OBJECT) { + __ str(src->as_register(), frame_map()->address_for_slot(dest->single_stack_ix())); + __ verify_oop(src->as_register()); + } else 
{ + __ str(src->as_register(), frame_map()->address_for_slot(dest->single_stack_ix())); + } + + } else if (src->is_double_cpu()) { + Address dest_addr_LO = frame_map()->address_for_slot(dest->double_stack_ix(), lo_word_offset_in_bytes); + __ strd(src->as_register_lo(), src->as_register_hi(), dest_addr_LO); + } else if (src->is_single_fpu()) { + Address dest_addr = frame_map()->address_for_slot(dest->single_stack_ix()); + __ vstr_f32(src->as_float_reg(), dest_addr.safe_for(Address::IDT_FLOAT, _masm, rscratch1)); + } else if (src->is_double_fpu()) { + Address dest_addr = frame_map()->address_for_slot(dest->double_stack_ix()); + __ vstr_f64(src->as_double_reg(), dest_addr.safe_for(Address::IDT_DOUBLE, _masm, rscratch1)); + } else { + ShouldNotReachHere(); + } + +} + + +void LIR_Assembler::reg2mem(LIR_Opr src, LIR_Opr dest, BasicType type, LIR_PatchCode patch_code, CodeEmitInfo* info, bool pop_fpu_stack, bool wide, bool /* unaligned */) { + LIR_Address* to_addr = dest->as_address_ptr(); + + if (type == T_ARRAY || type == T_OBJECT) { + __ verify_oop(src->as_register()); + } + + PatchingStub* patch = NULL; + if (patch_code != lir_patch_none) { + assert(to_addr->disp() != 0, "must have"); + + patch = new PatchingStub(_masm, PatchingStub::access_field_id); + address const_addr = __ address_constant(0); + if (!const_addr) BAILOUT("patchable offset"); + __ relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS)); + __ patchable_load(rscratch1, const_addr); + patching_epilog(patch, patch_code, to_addr->base()->as_register(), info); + + to_addr = new LIR_Address(to_addr->base(), FrameMap::rscratch1_opr, to_addr->type()); + } + + + int null_check_here = code_offset(); + switch (type) { + case T_FLOAT: + if(src->is_single_fpu()) { + Address addr = as_Address(to_addr, Address::IDT_FLOAT); + null_check_here = code_offset(); + __ vstr_f32(src->as_float_reg(), addr); + break; + } // fall through at FPUless system + case T_ARRAY: // fall through + case T_OBJECT: // fall through + case T_ADDRESS: // fall though + case T_INT: { + Address addr = as_Address(to_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ str(src->as_register(), addr); + break; + } + case T_METADATA: + // We get here to store a method pointer to the stack to pass to + // a dtrace runtime call. This can't work on 64 bit with + // compressed klass ptrs: T_METADATA can be a compressed klass + // ptr or a 64 bit method pointer. 
+ ShouldNotReachHere(); +// __ str(src->as_register(), as_Address(to_addr)); + break; + + case T_DOUBLE: + if(src->is_double_fpu()) { + Address addr = as_Address(to_addr, Address::IDT_DOUBLE); + null_check_here = code_offset(); + __ vstr_f64(src->as_double_reg(), addr); + break; + } // fall through at FPUless system + case T_LONG: { + Address addr = as_Address_lo(to_addr, Address::IDT_LONG); + null_check_here = code_offset(); + null_check_here += __ strd(src->as_register_lo(), src->as_register_hi(), addr); + break; + } + + case T_BYTE: // fall through + case T_BOOLEAN: { + Address addr = as_Address(to_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ strb(src->as_register(), addr); + break; + } + case T_CHAR: // fall through + case T_SHORT: { + Address addr = as_Address(to_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ strh(src->as_register(), addr); + break; + } + default: + ShouldNotReachHere(); + } + + if (info != NULL) { + add_debug_info_for_null_check(null_check_here, info); + } +} + + +void LIR_Assembler::stack2reg(LIR_Opr src, LIR_Opr dest, BasicType type) { + assert(src->is_stack(), "should not call otherwise"); + assert(dest->is_register(), "should not call otherwise"); + + if (dest->is_single_cpu()) { + if (type == T_ARRAY || type == T_OBJECT) { + __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix())); + __ verify_oop(dest->as_register()); + } else { + __ ldr(dest->as_register(), frame_map()->address_for_slot(src->single_stack_ix())); + } + + } else if (dest->is_double_cpu()) { + Address src_addr_LO = frame_map()->address_for_slot(src->double_stack_ix(), lo_word_offset_in_bytes); + __ ldrd(dest->as_register_lo(), dest->as_register_hi(), src_addr_LO); + } else if (dest->is_single_fpu()) { + Address src_addr = frame_map()->address_for_slot(src->single_stack_ix()); + __ vldr_f32(dest->as_float_reg(), src_addr.safe_for(Address::IDT_FLOAT, _masm, rscratch1)); + } else if (dest->is_double_fpu()) { + Address src_addr = frame_map()->address_for_slot(src->double_stack_ix()); + __ vldr_f64(dest->as_double_reg(), src_addr.safe_for(Address::IDT_DOUBLE, _masm, rscratch1)); + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::klass2reg_with_patching(Register reg, CodeEmitInfo* info) { + PatchingStub* patch = new PatchingStub(_masm, PatchingStub::load_klass_id); + __ relocate(metadata_Relocation::spec(__ oop_recorder()->allocate_metadata_index(NULL))); + __ patchable_load(reg, pc()); + patching_epilog(patch, lir_patch_normal, reg, info); +} + +void LIR_Assembler::stack2stack(LIR_Opr src, LIR_Opr dest, BasicType type) { + + LIR_Opr temp; + if (type == T_LONG || type == T_DOUBLE) + temp = FrameMap::rscratch_long_opr; + else + temp = FrameMap::rscratch1_opr; + + stack2reg(src, temp, src->type()); + reg2stack(temp, dest, dest->type(), false); +} + + +void LIR_Assembler::mem2reg(LIR_Opr src, LIR_Opr dest, BasicType type, LIR_PatchCode patch_code, CodeEmitInfo* info, bool wide, bool /* unaligned */) { + LIR_Address* from_addr = src->as_address_ptr(); + + if (from_addr->base()->type() == T_OBJECT) { + __ verify_oop(from_addr->base()->as_pointer_register()); + } + + PatchingStub* patch = NULL; + if (patch_code != lir_patch_none) { + assert(from_addr->disp() != 0, "must have"); + + patch = new PatchingStub(_masm, PatchingStub::access_field_id); + address const_addr = __ address_constant(0); + if (!const_addr) BAILOUT("patchable offset"); + __ relocate(section_word_Relocation::spec(const_addr, 
CodeBuffer::SECT_CONSTS)); + __ patchable_load(rscratch1, const_addr); + patching_epilog(patch, patch_code, from_addr->base()->as_register(), info); + + from_addr = new LIR_Address(from_addr->base(), FrameMap::rscratch1_opr, from_addr->type()); + } + + int null_check_here = code_offset(); + + switch (type) { + case T_FLOAT: + if(dest->is_single_fpu()){ + Address addr = as_Address(from_addr, Address::IDT_FLOAT); + null_check_here = code_offset(); + __ vldr_f32(dest->as_float_reg(), addr); + break; + } // fall through at FPUless systems + case T_ARRAY: // fall through + case T_OBJECT: // fall through + case T_ADDRESS: // fall through + case T_INT: { + Address addr = as_Address(from_addr, Address::toInsnDataType(type)); + null_check_here = code_offset(); + __ ldr(dest->as_register(), addr); + break; + } + case T_METADATA: + // We get here to store a method pointer to the stack to pass to + // a dtrace runtime call. This can't work on 64 bit with + // compressed klass ptrs: T_METADATA can be a compressed klass + // ptr or a 64 bit method pointer. + ShouldNotReachHere(); +// __ ldr(dest->as_register(), as_Address(from_addr)); + break; + case T_DOUBLE: + if(dest->is_double_fpu()){ + Address addr = as_Address(from_addr, Address::IDT_DOUBLE); + null_check_here = code_offset(); + __ vldr_f64(dest->as_double_reg(), addr); + break; + } // fall through at FPUless systems + case T_LONG: { + Address addr = as_Address_lo(from_addr, Address::IDT_LONG); + null_check_here = code_offset(); + null_check_here += __ ldrd(dest->as_register_lo(), dest->as_register_hi(), addr); + break; + } + + case T_BYTE: { + Address addr = as_Address(from_addr, Address::IDT_BYTE); + null_check_here = code_offset(); + __ ldrsb(dest->as_register(), addr); + break; + } + case T_BOOLEAN: { + Address addr = as_Address(from_addr, Address::IDT_BOOLEAN); + null_check_here = code_offset(); + __ ldrb(dest->as_register(), addr); + break; + } + + case T_CHAR: { + Address addr = as_Address(from_addr, Address::IDT_CHAR); + null_check_here = code_offset(); + __ ldrh(dest->as_register(), addr); + break; + } + case T_SHORT: { + Address addr = as_Address(from_addr, Address::IDT_SHORT); + null_check_here = code_offset(); + __ ldrsh(dest->as_register(), addr); + break; + } + + default: + ShouldNotReachHere(); + } + + if (type == T_ARRAY || type == T_OBJECT) { + __ verify_oop(dest->as_register()); + } + + if (info != NULL) { + add_debug_info_for_null_check(null_check_here, info); + } +} + + +int LIR_Assembler::array_element_size(BasicType type) const { + int elem_size = type2aelembytes(type); + return exact_log2(elem_size); +} + +void LIR_Assembler::emit_op3(LIR_Op3* op) { + Register Rdividend = op->in_opr1()->as_register(); + Register Rdivisor = op->in_opr2()->as_register(); + Register Rscratch = op->in_opr3()->as_register(); + Register Rresult = op->result_opr()->as_register(); + int divisor = -1; + + /* + TODO: For some reason, using the Rscratch that gets passed in is + not possible because the register allocator does not see the tmp reg + as used, and assignes it the same register as Rdividend. We use rscratch1 + instead. 
+ + assert(Rdividend != Rscratch, ""); + assert(Rdivisor != Rscratch, ""); + */ + + if (Rdivisor == noreg && is_power_of_2(divisor)) { + // convert division by a power of two into some shifts and logical operations + } + + assert(op->code() == lir_irem || op->code() == lir_idiv, "should be irem or idiv"); + bool want_remainder = op->code() == lir_irem; + + __ divide(Rresult, Rdividend, Rdivisor, 32, want_remainder); +} + +void LIR_Assembler::emit_opBranch(LIR_OpBranch* op) { +#ifdef ASSERT + assert(op->block() == NULL || op->block()->label() == op->label(), "wrong label"); + if (op->block() != NULL) _branch_target_blocks.append(op->block()); + if (op->ublock() != NULL) _branch_target_blocks.append(op->ublock()); +#endif + + if (op->cond() == lir_cond_always) { + if (op->info() != NULL) add_debug_info_for_branch(op->info()); + __ b(*(op->label())); + } else { + Assembler::Condition acond; + if (op->code() == lir_cond_float_branch) { + bool is_unordered = (op->ublock() == op->block()); + // Assembler::EQ does not permit unordered branches, so we add + // another branch here. Likewise, Assembler::NE does not permit + // ordered branches. + if (is_unordered && op->cond() == lir_cond_equal + || !is_unordered && op->cond() == lir_cond_notEqual) + __ b(*(op->ublock()->label()), Assembler::VS); + switch(op->cond()) { + case lir_cond_equal: acond = Assembler::EQ; break; + case lir_cond_notEqual: acond = Assembler::NE; break; + case lir_cond_less: acond = (is_unordered ? Assembler::LT : Assembler::LO); break; + case lir_cond_lessEqual: acond = (is_unordered ? Assembler::LE : Assembler::LS); break; + case lir_cond_greaterEqual: acond = (is_unordered ? Assembler::HS : Assembler::GE); break; + case lir_cond_greater: acond = (is_unordered ? Assembler::HI : Assembler::GT); break; + default: ShouldNotReachHere(); + acond = Assembler::EQ; // unreachable + } + } else { + switch (op->cond()) { + case lir_cond_equal: acond = Assembler::EQ; break; + case lir_cond_notEqual: acond = Assembler::NE; break; + case lir_cond_less: acond = Assembler::LT; break; + case lir_cond_greaterEqual: acond = Assembler::GE; break; + case lir_cond_lessEqual: acond = Assembler::LE; break; + case lir_cond_greater: acond = Assembler::GT; break; + case lir_cond_belowEqual: acond = Assembler::LS; break; + case lir_cond_aboveEqual: acond = Assembler::HS; break; + default: ShouldNotReachHere(); + acond = Assembler::EQ; // unreachable + } + if (op->type() == T_LONG) { + // a special trick here to be able to effectively compare jlongs + // for the lessEqual and greater conditions the jlong operands are swapped + // during comparison and hence should use mirror condition in conditional + // instruction + // see LIR_Assembler::comp_op and LIR_Assembler::cmove + switch (op->cond()) { + case lir_cond_lessEqual: acond = Assembler::GE; break; + case lir_cond_greater: acond = Assembler::LT; break; + } + } + } + __ b(*(op->label()), acond); + } +} + +FloatRegister LIR_Assembler::as_float_reg(LIR_Opr doubleReg) { + assert(doubleReg->is_double_fpu(), "must be f64"); + return as_FloatRegister(doubleReg->fpu_regnrLo()); +} + +void LIR_Assembler::emit_opConvert(LIR_OpConvert* op) { + LIR_Opr src = op->in_opr(); + LIR_Opr dest = op->result_opr(); + + switch (op->bytecode()) { + case Bytecodes::_i2f: + { + __ vmov_f32(dest->as_float_reg(), src->as_register()); + __ vcvt_f32_s32(dest->as_float_reg(), dest->as_float_reg()); + break; + } + case Bytecodes::_i2d: + { + __ vmov_f32(as_float_reg(dest), src->as_register()); + __ 
vcvt_f64_s32(dest->as_double_reg(), as_float_reg(dest)); + break; + } + case Bytecodes::_f2d: + { + __ vcvt_f64_f32(dest->as_double_reg(), src->as_float_reg()); + break; + } + case Bytecodes::_d2f: + { + __ vcvt_f32_f64(dest->as_float_reg(), src->as_double_reg()); + break; + } + case Bytecodes::_i2c: + { + __ uxth(dest->as_register(), src->as_register()); + break; + } + case Bytecodes::_i2l: + { + const Register dst_hi = dest->as_register_hi(); + const Register dst_lo = dest->as_register_lo(); + const Register src_lo = as_reg(src); + __ mov(dst_lo, src_lo); + __ asr(dst_hi, src_lo, 31); + break; + } + case Bytecodes::_i2s: + { + __ sxth(dest->as_register(), src->as_register()); + break; + } + case Bytecodes::_i2b: + { + __ sxtb(dest->as_register(), src->as_register()); + break; + } + case Bytecodes::_l2i: + { + assert(dest->is_single_cpu(), "must be single register"); + __ mov(dest->as_register(), src->as_register_lo()); + break; + } + case Bytecodes::_f2i: + { + __ vcvt_s32_f32(src->as_float_reg(), src->as_float_reg()); + __ vmov_f32(dest->as_register(), src->as_float_reg()); + break; + } + case Bytecodes::_d2i: + { + __ vcvt_s32_f64(as_float_reg(src), src->as_double_reg()); + __ vmov_f32(dest->as_register(), as_float_reg(src)); + break; + } + default: ShouldNotReachHere(); + } +} + +void LIR_Assembler::emit_alloc_obj(LIR_OpAllocObj* op) { + if (op->init_check()) { + __ ldrb(rscratch1, Address(op->klass()->as_register(), + InstanceKlass::init_state_offset())); + __ cmp(rscratch1, InstanceKlass::fully_initialized); + add_debug_info_for_null_check_here(op->stub()->info()); + __ b(*op->stub()->entry(), Assembler::NE); + } + __ allocate_object(op->obj()->as_register(), + op->tmp1()->as_register(), + op->tmp2()->as_register(), + op->header_size(), + op->object_size(), + op->klass()->as_register(), + *op->stub()->entry()); + __ bind(*op->stub()->continuation()); +} + +void LIR_Assembler::emit_alloc_array(LIR_OpAllocArray* op) { + Register len = as_reg(op->len()); + + if (UseSlowPath || + (!UseFastNewObjectArray && (op->type() == T_OBJECT || op->type() == T_ARRAY)) || + (!UseFastNewTypeArray && (op->type() != T_OBJECT && op->type() != T_ARRAY))) { + __ b(*op->stub()->entry()); + } else { + Register tmp1 = op->tmp1()->as_register(); + Register tmp2 = op->tmp2()->as_register(); + Register tmp3 = op->tmp3()->as_register(); + if (len == tmp1) { + tmp1 = tmp3; + } else if (len == tmp2) { + tmp2 = tmp3; + } else if (len == tmp3) { + // everything is ok + } else { + __ mov(tmp3, len); + } + __ allocate_array(op->obj()->as_register(), + len, + tmp1, + tmp2, + arrayOopDesc::header_size(op->type()), + array_element_size(op->type()), + op->klass()->as_register(), + *op->stub()->entry()); + } + __ bind(*op->stub()->continuation()); +} + +void LIR_Assembler::type_profile_helper(Register mdo, + ciMethodData *md, ciProfileData *data, + Register recv, Label* update_done) { + for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { + Label next_test; + // See if the receiver is receiver[n]. 
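+ // byte_offset_of_slot() gives the byte offsets, within the MethodData, of row i's receiver cell (receiver_offset(i)) and its hit counter (receiver_count_offset(i)).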
+ __ lea(rscratch2, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)))); + __ ldr(rscratch1, Address(rscratch2)); + __ cmp(recv, rscratch1); + __ b(next_test, Assembler::NE); + Address data_addr(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i))); + __ addptr(data_addr, DataLayout::counter_increment); + __ b(*update_done); + __ bind(next_test); + } + + // Didn't find receiver; find next empty slot and fill it in + for (uint i = 0; i < ReceiverTypeData::row_limit(); i++) { + Label next_test; + __ lea(rscratch2, + Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_offset(i)))); + Address recv_addr(rscratch2); + __ ldr(rscratch1, recv_addr); + __ cbnz(rscratch1, next_test); + __ str(recv, recv_addr); + __ mov(rscratch1, DataLayout::counter_increment); + __ lea(rscratch2, Address(mdo, md->byte_offset_of_slot(data, ReceiverTypeData::receiver_count_offset(i)))); + __ str(rscratch1, Address(rscratch2)); + __ b(*update_done); + __ bind(next_test); + } +} + +void LIR_Assembler::emit_typecheck_helper(LIR_OpTypeCheck *op, Label* success, Label* failure, Label* obj_is_null) { + // we always need a stub for the failure case. + CodeStub* stub = op->stub(); + Register obj = op->object()->as_register(); + Register k_RInfo = op->tmp1()->as_register(); + Register klass_RInfo = op->tmp2()->as_register(); + Register dst = op->result_opr()->as_register(); + ciKlass* k = op->klass(); + Register Rtmp1 = noreg; + + // check if it needs to be profiled + ciMethodData* md; + ciProfileData* data; + + const bool should_profile = op->should_profile(); + if (should_profile) { + ciMethod* method = op->profiled_method(); + assert(method != NULL, "Should have method"); + int bci = op->profiled_bci(); + md = method->method_data_or_null(); + assert(md != NULL, "Sanity"); + data = md->bci_to_data(bci); + assert(data != NULL, "need data for type check"); + assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check"); + } + Label profile_cast_success, profile_cast_failure; + Label *success_target = op->should_profile() ? &profile_cast_success : success; + Label *failure_target = op->should_profile() ? 
&profile_cast_failure : failure; + + if (obj == k_RInfo) { + k_RInfo = dst; + } else if (obj == klass_RInfo) { + klass_RInfo = dst; + } + if (k->is_loaded()) { + select_different_registers(obj, dst, k_RInfo, klass_RInfo); + } else { + Rtmp1 = op->tmp3()->as_register(); + select_different_registers(obj, dst, k_RInfo, klass_RInfo, Rtmp1); + } + + assert_different_registers(obj, k_RInfo, klass_RInfo); + + if (should_profile) { + Label not_null; + __ cbnz(obj, not_null); + // Object is null; update MDO and exit + Register mdo = klass_RInfo; + __ mov_metadata(mdo, md->constant_encoding()); + Address data_addr + = __ form_address(rscratch2, mdo, + md->byte_offset_of_slot(data, DataLayout::flags_offset()), + 0); + __ ldrb(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, BitData::null_seen_byte_constant()); + __ strb(rscratch1, data_addr); + __ b(*obj_is_null); + __ bind(not_null); + } else { + __ cbz(obj, *obj_is_null); + } + + if (!k->is_loaded()) { + klass2reg_with_patching(k_RInfo, op->info_for_patch()); + } else { + __ mov_metadata(k_RInfo, k->constant_encoding()); + } + __ verify_oop(obj); + + if (op->fast_check()) { + // get object class + // not a safepoint as obj null check happens earlier + __ load_klass(rscratch1, obj); + __ cmp( rscratch1, k_RInfo); + + __ b(*failure_target, Assembler::NE); + // successful cast, fall through to profile or jump + } else { + // get object class + // not a safepoint as obj null check happens earlier + __ load_klass(klass_RInfo, obj); + if (k->is_loaded()) { + // See if we get an immediate positive hit + __ ldr(rscratch1, Address(klass_RInfo, long(k->super_check_offset()))); + __ cmp(k_RInfo, rscratch1); + if ((juint)in_bytes(Klass::secondary_super_cache_offset()) != k->super_check_offset()) { + __ b(*failure_target, Assembler::NE); + // successful cast, fall through to profile or jump + } else { + // See if we get an immediate positive hit + __ b(*success_target, Assembler::EQ); + // check for self + __ cmp(klass_RInfo, k_RInfo); + __ b(*success_target, Assembler::EQ); + + __ push(klass_RInfo); + __ push(k_RInfo); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + __ ldr(klass_RInfo, Address(__ post(sp, 2 * wordSize))); + + // result is a boolean + __ cbz(klass_RInfo, *failure_target); + // successful cast, fall through to profile or jump + } + } else { + // perform the fast part of the checking logic + __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL); + // call out-of-line instance of __ check_klass_subtype_slow_path(...): + __ push(klass_RInfo); + __ push(k_RInfo); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + __ ldr(k_RInfo, Address(__ post(sp, 2 * wordSize))); + + // result is a boolean + __ cbz(k_RInfo, *failure_target); + // successful cast, fall through to profile or jump + } + } + if (should_profile) { + Register mdo = klass_RInfo, recv = k_RInfo; + __ bind(profile_cast_success); + __ mov_metadata(mdo, md->constant_encoding()); + __ load_klass(recv, obj); + Label update_done; + type_profile_helper(mdo, md, data, recv, success); + __ b(*success); + + __ bind(profile_cast_failure); + __ mov_metadata(mdo, md->constant_encoding()); + Address counter_addr + = __ form_address(rscratch2, mdo, + md->byte_offset_of_slot(data, CounterData::count_offset()), + 0); + __ ldr(rscratch1, counter_addr); + __ sub(rscratch1, rscratch1, DataLayout::counter_increment); + __ str(rscratch1, counter_addr); + __ b(*failure); + } + __ 
b(*success); +} + + +void LIR_Assembler::emit_opTypeCheck(LIR_OpTypeCheck* op) { + LIR_Code code = op->code(); + if (code == lir_store_check) { + Register value = op->object()->as_register(); + Register array = op->array()->as_register(); + Register k_RInfo = op->tmp1()->as_register(); + Register klass_RInfo = op->tmp2()->as_register(); + Register Rtmp1 = op->tmp3()->as_register(); + + CodeStub* stub = op->stub(); + + // check if it needs to be profiled + ciMethodData* md; + ciProfileData* data; + + const bool should_profile = op->should_profile(); + if (should_profile) { + ciMethod* method = op->profiled_method(); + assert(method != NULL, "Should have method"); + int bci = op->profiled_bci(); + md = method->method_data_or_null(); + assert(md != NULL, "Sanity"); + data = md->bci_to_data(bci); + assert(data != NULL, "need data for type check"); + assert(data->is_ReceiverTypeData(), "need ReceiverTypeData for type check"); + } + Label profile_cast_success, profile_cast_failure, done; + Label *success_target = should_profile ? &profile_cast_success : &done; + Label *failure_target = should_profile ? &profile_cast_failure : stub->entry(); + + if (should_profile) { + Label not_null; + __ cbnz(value, not_null); + // Object is null; update MDO and exit + Register mdo = klass_RInfo; + __ mov_metadata(mdo, md->constant_encoding()); + Address data_addr + = __ form_address(rscratch2, mdo, + md->byte_offset_of_slot(data, DataLayout::flags_offset()), + 0); + __ ldrb(rscratch1, data_addr); + __ orr(rscratch1, rscratch1, BitData::null_seen_byte_constant()); + __ strb(rscratch1, data_addr); + __ b(done); + __ bind(not_null); + } else { + __ cbz(value, done); + } + + add_debug_info_for_null_check_here(op->info_for_exception()); + __ load_klass(k_RInfo, array); + __ load_klass(klass_RInfo, value); + + // get instance klass (it's already uncompressed) + __ ldr(k_RInfo, Address(k_RInfo, ObjArrayKlass::element_klass_offset())); + // perform the fast part of the checking logic + __ check_klass_subtype_fast_path(klass_RInfo, k_RInfo, Rtmp1, success_target, failure_target, NULL); + // call out-of-line instance of __ check_klass_subtype_slow_path(...): + __ push(klass_RInfo); + __ push(k_RInfo); + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + __ ldr(k_RInfo, Address(__ post(sp, 2 * wordSize))); + // result is a boolean + __ cbz(k_RInfo, *failure_target); + // fall through to the success case + + if (should_profile) { + Register mdo = klass_RInfo, recv = k_RInfo; + __ bind(profile_cast_success); + __ mov_metadata(mdo, md->constant_encoding()); + __ load_klass(recv, value); + type_profile_helper(mdo, md, data, recv, &done); + __ b(done); + + __ bind(profile_cast_failure); + __ mov_metadata(mdo, md->constant_encoding()); + Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); + __ lea(rscratch2, counter_addr); + __ ldr(rscratch1, Address(rscratch2)); + __ sub(rscratch1, rscratch1, DataLayout::counter_increment); + __ str(rscratch1, Address(rscratch2)); + __ b(*stub->entry()); + } + + __ bind(done); + } else if (code == lir_checkcast) { + Register obj = op->object()->as_register(); + Register dst = op->result_opr()->as_register(); + Label success; + emit_typecheck_helper(op, &success, op->stub()->entry(), &success); + __ bind(success); + if (dst != obj) { + __ mov(dst, obj); + } + } else if (code == lir_instanceof) { + Register obj = op->object()->as_register(); + Register dst = op->result_opr()->as_register(); + Label success, failure, done; + 
emit_typecheck_helper(op, &success, &failure, &failure); + __ bind(failure); + __ mov(dst, 0); + __ b(done); + __ bind(success); + __ mov(dst, 1); + __ bind(done); + } else { + ShouldNotReachHere(); + } +} + +// TODO: reuse masm cmpxchgw +void LIR_Assembler::casw(Register addr, Register newval, Register cmpval, Register result) { + assert(newval != cmpval, "must be different"); + Label retry_load, nope; + // flush and load exclusive from the memory location + // and fail if it is not what we expect + __ bind(retry_load); + __ ldrex(result, addr); + __ cmp(result, cmpval); + __ mov(result, 1, Assembler::NE); + __ b(nope, Assembler::NE); + // if the store-exclusive succeeds (no intervening write), result will be zero + __ strex(result, newval, addr); + // retry if the store-exclusive fails, so we only ever return after the compare fails + // or the store succeeds; this ensures we don't return a stale value after a failed write. + __ cbnz(result, retry_load); + __ membar(__ AnyAny); + __ bind(nope); +} + +void LIR_Assembler::casl(Register addr, Register newval_lo, Register newval_hi, Register cmpval_lo, Register cmpval_hi, Register tmp_lo, Register tmp_hi, Register result) { + assert(newval_lo->successor() == newval_hi, "must be contiguous"); + assert(tmp_lo->successor() == tmp_hi, "must be contiguous"); + assert(tmp_lo->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert_different_registers(newval_lo, newval_hi, cmpval_lo, cmpval_hi, tmp_lo, tmp_hi); + + Label retry_load, nope; + // flush and load exclusive from the memory location + // and fail if it is not what we expect + __ bind(retry_load); + __ mov(result, 1); + __ ldrexd(tmp_lo, addr); + __ cmp(tmp_lo, cmpval_lo); + __ b(nope, Assembler::NE); + __ cmp(tmp_hi, cmpval_hi); + __ b(nope, Assembler::NE); + // if the store-exclusive succeeds (no intervening write), result will be zero + __ strexd(result, newval_lo, addr); + // retry if the store-exclusive fails, so we only ever return after the compare fails + // or the store succeeds; this ensures we don't return a stale value after a failed write. 
+ __ cbnz(result, retry_load); + __ membar(__ AnyAny); + __ bind(nope); +} + + +void LIR_Assembler::emit_compare_and_swap(LIR_OpCompareAndSwap* op) { + Register addr; + if (op->addr()->is_register()) { + addr = as_reg(op->addr()); + } else { + assert(op->addr()->is_address(), "what else?"); + LIR_Address* addr_ptr = op->addr()->as_address_ptr(); + assert(addr_ptr->disp() == 0, "need 0 disp"); + assert(addr_ptr->index() == LIR_OprDesc::illegalOpr(), "need 0 index"); + addr = as_reg(addr_ptr->base()); + } + Register result = as_reg(op->result_opr()); + if (op->code() == lir_cas_obj || op->code() == lir_cas_int) { + Register newval = as_reg(op->new_value()); + Register cmpval = as_reg(op->cmp_value()); + casw(addr, newval, cmpval, result); + } else if (op->code() == lir_cas_long){ + Register newval_lo = op->new_value()->as_register_lo(); + Register newval_hi = op->new_value()->as_register_hi(); + Register cmpval_lo = op->cmp_value()->as_register_lo(); + Register cmpval_hi = op->cmp_value()->as_register_hi(); + Register tmp_lo = op->tmp1()->as_register_lo(); + Register tmp_hi = op->tmp1()->as_register_hi(); + casl(addr, newval_lo, newval_hi, cmpval_lo, cmpval_hi, tmp_lo, tmp_hi, result); + } else { + ShouldNotReachHere(); + } +} + +static void patch_condition(address start_insn, address end_insn, Assembler::Condition cond) { + for (uint32_t* insn_p = (uint32_t*) start_insn; (address) insn_p < end_insn; ++insn_p) { + uint32_t insn = *insn_p; + assert((insn >> 28) == Assembler::AL, "instructions in patch" + " should allow conditional form and be in ALWAYS condition"); + *insn_p = (insn & 0x0fffffff) | (cond << 28); + } +} + +void LIR_Assembler::cmove(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Opr result, BasicType type) { + + Assembler::Condition acond, ncond; + switch (condition) { + case lir_cond_equal: acond = Assembler::EQ; ncond = Assembler::NE; break; + case lir_cond_notEqual: acond = Assembler::NE; ncond = Assembler::EQ; break; + case lir_cond_less: acond = Assembler::LT; ncond = Assembler::GE; break; + case lir_cond_greaterEqual: acond = Assembler::GE; ncond = Assembler::LT; break; + case lir_cond_lessEqual: acond = Assembler::LE; ncond = Assembler::GT; break; + case lir_cond_greater: acond = Assembler::GT; ncond = Assembler::LE; break; + case lir_cond_belowEqual: Unimplemented(); return; + case lir_cond_aboveEqual: Unimplemented(); return; + default: ShouldNotReachHere(); return; + } + if (type == T_LONG) { + // for the lessEqual and greater conditions the jlong operands are swapped + // during comparison and hence should use mirror condition in conditional + // instruction. 
see comp_op()) + switch (condition) { + case lir_cond_lessEqual: acond = Assembler::GE; ncond = Assembler::LT; break; + case lir_cond_greater: acond = Assembler::LT; ncond = Assembler::GE; break; + } + } + + address true_instrs = __ pc(); + if (opr1->is_cpu_register()) { + reg2reg(opr1, result); + } else if (opr1->is_stack()) { + stack2reg(opr1, result, result->type()); + } else if (opr1->is_constant()) { + const2reg(opr1, result, lir_patch_none, NULL); + } else { + ShouldNotReachHere(); + } + patch_condition(true_instrs, __ pc(), acond); + + address false_instrs = __ pc(); + if (opr2->is_cpu_register()) { + reg2reg(opr2, result); + } else if (opr2->is_stack()) { + stack2reg(opr2, result, result->type()); + } else if (opr2->is_constant()) { + const2reg(opr2, result, lir_patch_none, NULL); + } else { + ShouldNotReachHere(); + } + patch_condition(false_instrs, __ pc(), ncond); +} + +void LIR_Assembler::arith_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dest, CodeEmitInfo* info, bool pop_fpu_stack) { + assert(info == NULL, "should never be used, idiv/irem and ldiv/lrem not handled by this method"); + + if (left->is_single_cpu()) { + assert(left->type() != T_FLOAT, "expect integer type"); + assert(right->type() != T_FLOAT, "expect integer type"); + assert(dest->type() != T_FLOAT, "expect integer type"); + + Register lreg = left->as_register(); + Register dreg = as_reg(dest); + + if (right->is_single_cpu()) { + // cpu register - cpu register + + assert((left->type() == T_INT || left->type() == T_OBJECT) + && right->type() == T_INT + && dest->type() == T_INT, + "should be"); + Register rreg = right->as_register(); + switch (code) { + case lir_add: __ add (dest->as_register(), lreg, rreg); break; + case lir_sub: __ sub (dest->as_register(), lreg, rreg); break; + case lir_mul: __ mul (dest->as_register(), lreg, rreg); break; + default: ShouldNotReachHere(); + } + + } else if (right->is_double_cpu()) { + ShouldNotReachHere(); // for obj+long op the generator casts long to int before invoking add + } else if (right->is_constant()) { + // cpu register - constant + jint c = right->as_constant_ptr()->as_jint(); + + assert(code == lir_add || code == lir_sub, "mismatched arithmetic op"); + if (c == 0 && dreg == lreg) { + COMMENT("effective nop elided"); + return; + } + + if (Assembler::operand_valid_for_add_sub_immediate(c)) { + switch (code) { + case lir_add: __ add(dreg, lreg, c); break; + case lir_sub: __ sub(dreg, lreg, c); break; + default: ShouldNotReachHere(); + } + } else { + __ mov(rscratch1, c); + switch (code) { + case lir_add: __ add(dreg, lreg, rscratch1); break; + case lir_sub: __ sub(dreg, lreg, rscratch1); break; + default: ShouldNotReachHere(); + } + } + } else { + ShouldNotReachHere(); + } + + } else if (left->is_double_cpu()) { + assert(left->type() != T_DOUBLE, "expect integer type"); + assert(right->type() != T_DOUBLE, "expect integer type"); + assert(dest->type() != T_DOUBLE, "expect integer type"); + + Register lreg_lo = left->as_register_lo(); + Register lreg_hi = left->as_register_hi(); + + if (right->is_double_cpu()) { + // cpu register - cpu register + Register rreg_lo = right->as_register_lo(); + Register rreg_hi = right->as_register_hi(); + Register dreg_lo = dest->as_register_lo(); + Register dreg_hi = dest->as_register_hi(); + if (code == lir_add || code == lir_sub) { + check_register_collision(dreg_lo, &lreg_hi, &rreg_hi); + } + switch (code) { + case lir_add: __ adds (dreg_lo, lreg_lo, rreg_lo); + __ adc (dreg_hi, lreg_hi, rreg_hi); break; + case lir_sub: __ 
subs (dreg_lo, lreg_lo, rreg_lo); + __ sbc (dreg_hi, lreg_hi, rreg_hi); break; + case lir_mul: __ mult_long (dreg_lo, dreg_hi, + lreg_lo, lreg_hi, rreg_lo, rreg_hi); break; + default: + ShouldNotReachHere(); + } + + } else if (right->is_constant()) { + const jint c_lo = right->as_constant_ptr()->as_jint_lo_bits(); + const jint c_hi = right->as_constant_ptr()->as_jint_hi_bits(); + const Register dreg_lo = dest->as_register_lo(); + const Register dreg_hi = dest->as_register_hi(); + assert(code == lir_add || code == lir_sub, "mismatched arithmetic op"); + if (c_lo == 0 && c_hi == 0 && dreg_lo == lreg_lo && dreg_hi == lreg_hi) { + COMMENT("effective nop elided"); + return; + } + check_register_collision(dreg_lo, &lreg_hi, NULL, rscratch2); + switch (code) { + case lir_add: + if (Assembler::operand_valid_for_add_sub_immediate(c_lo)) + __ adds(dreg_lo, lreg_lo, c_lo); + else { + __ mov(rscratch1, c_lo); + __ adds(dreg_lo, lreg_lo, rscratch1); + } + if (Assembler::operand_valid_for_add_sub_immediate(c_hi)) + __ adc(dreg_hi, lreg_hi, c_hi); + else { + __ mov(rscratch1, c_hi); + __ adc(dreg_lo, lreg_hi, rscratch1); + } + break; + case lir_sub: + if (Assembler::operand_valid_for_add_sub_immediate(c_lo)) + __ subs(dreg_lo, lreg_lo, c_lo); + else { + __ mov(rscratch1, c_lo); + __ subs(dreg_lo, lreg_lo, rscratch1); + } + if (Assembler::operand_valid_for_add_sub_immediate(c_hi)) + __ sbc(dreg_hi, lreg_hi, c_hi); + else { + __ mov(rscratch1, c_hi); + __ sbc(dreg_hi, lreg_hi, rscratch1); + } + break; + default: + ShouldNotReachHere(); + } + } else { + ShouldNotReachHere(); + } + } else if (left->is_single_fpu()) { + assert(right->is_single_fpu(), "right hand side of float arithmetics needs to be float register"); + switch (code) { + case lir_add: __ vadd_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + case lir_sub: __ vsub_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + case lir_mul: __ vmul_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + case lir_div: __ vdiv_f32 (dest->as_float_reg(), left->as_float_reg(), right->as_float_reg()); break; + default: + ShouldNotReachHere(); + } + } else if (left->is_double_fpu()) { + if (right->is_double_fpu()) { + // cpu register - cpu register + switch (code) { + case lir_add: __ vadd_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + case lir_sub: __ vsub_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + case lir_mul: __ vmul_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + case lir_div: __ vdiv_f64 (dest->as_double_reg(), left->as_double_reg(), right->as_double_reg()); break; + default: + ShouldNotReachHere(); + } + } else { + if (right->is_constant()) { + ShouldNotReachHere(); + } + ShouldNotReachHere(); + } + } else if (left->is_single_stack() || left->is_address()) { + assert(left == dest, "left and dest must be equal"); + ShouldNotReachHere(); + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::intrinsic_op(LIR_Code code, LIR_Opr value, LIR_Opr unused, LIR_Opr dest, LIR_Op* op) { + switch(code) { + case lir_abs : __ vabs_f64(dest->as_double_reg(), value->as_double_reg()); break; + case lir_sqrt: __ vsqrt_f64(dest->as_double_reg(), value->as_double_reg()); break; + default : ShouldNotReachHere(); + } +} + +void LIR_Assembler::logic_op(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst) { + + assert(left->is_single_cpu() || left->is_double_cpu(), 
"expect single or double register"); + Register Rleft = left->is_single_cpu() ? left->as_register() : + left->as_register_lo(); + if (dst->is_single_cpu()) { + Register Rdst = dst->as_register(); + if (right->is_constant()) { + switch (code) { + case lir_logic_and: __ andr (Rdst, Rleft, right->as_jint()); break; + case lir_logic_or: __ orr (Rdst, Rleft, right->as_jint()); break; + case lir_logic_xor: __ eor (Rdst, Rleft, right->as_jint()); break; + default: ShouldNotReachHere(); break; + } + } else { + Register Rright = right->is_single_cpu() ? right->as_register() : + right->as_register_lo(); + switch (code) { + case lir_logic_and: __ andr (Rdst, Rleft, Rright); break; + case lir_logic_or: __ orr (Rdst, Rleft, Rright); break; + case lir_logic_xor: __ eor (Rdst, Rleft, Rright); break; + default: ShouldNotReachHere(); break; + } + } + } else { + assert(dst->is_double_cpu(), "mismatched logic op operand size"); + const Register Rdst_lo = dst->as_register_lo(); + const Register Rdst_hi = dst->as_register_hi(); + Register Rleft_hi = left->as_register_hi(); + if (right->is_constant()) { + // LIR generator enforces jlong constants to be valid_immediate12 + // so we know they fit into 32-bit int + switch (code) { + case lir_logic_and: __ andr (Rdst_lo, Rleft, (int)right->as_jlong()); break; + case lir_logic_or: __ orr (Rdst_lo, Rleft, (int)right->as_jlong()); break; + case lir_logic_xor: __ eor (Rdst_lo, Rleft, (int)right->as_jlong()); break; + default: ShouldNotReachHere(); break; + } + } else { + assert(right->is_double_cpu(), "mismatched logic op operand size"); + Register Rright_lo = right->as_register_lo(); + Register Rright_hi = right->as_register_hi(); + check_register_collision(Rdst_lo, &Rleft_hi, &Rright_hi); + switch (code) { + case lir_logic_and: __ andr (Rdst_lo, Rleft, Rright_lo); + __ andr (Rdst_hi, Rleft_hi, Rright_hi); break; + case lir_logic_or: __ orr (Rdst_lo, Rleft, Rright_lo); + __ orr (Rdst_hi, Rleft_hi, Rright_hi); break; + case lir_logic_xor: __ eor (Rdst_lo, Rleft, Rright_lo); + __ eor (Rdst_hi, Rleft_hi, Rright_hi); break; + default: ShouldNotReachHere(); break; + } + } + } +} + + + +void LIR_Assembler::arithmetic_idiv(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr temp, LIR_Opr result, CodeEmitInfo* info) { Unimplemented(); } + +void LIR_Assembler::comp_op(LIR_Condition condition, LIR_Opr opr1, LIR_Opr opr2, LIR_Op2* op) { + if (opr1->is_single_cpu()) { + + assert(opr1->type() != T_FLOAT, "expect integer type");// softfp guard + assert(opr2->type() != T_FLOAT, "expect integer type"); + + Register reg1 = as_reg(opr1); + if (opr2->is_single_cpu()) { + // cpu register - cpu register + Register reg2 = opr2->as_register(); + if (opr1->type() == T_OBJECT || opr1->type() == T_ARRAY) { + __ cmpoop(reg1, reg2); + } else { + assert(opr2->type() != T_OBJECT && opr2->type() != T_ARRAY, "cmp int, oop?"); + __ cmp(reg1, reg2); + } + } else if (opr2->is_constant()) { + LIR_Const* c = opr2->as_constant_ptr(); + if (c->type() == T_INT) { + __ cmp(reg1, c->as_jint(), rscratch1, Assembler::C_DFLT); + } else if (c->type() == T_OBJECT || c->type() == T_ARRAY) { + jobject o = c->as_jobject(); + if (o == NULL) { + __ cmp(reg1, (int32_t)NULL_WORD); + } else { + jobject2reg(o, rscratch1); + __ cmpoop(reg1, rscratch1); + } + } else { + fatal("unexpected type: %s", basictype_to_str(c->type())); + } + } else if (opr2->is_address()) { + __ ldr(rscratch2, as_Address(opr2->as_address_ptr(), rscratch1, Address::IDT_INT)); + __ cmp(reg1, rscratch2); + } else { + ShouldNotReachHere(); + } + + } 
else if (opr1->is_double_cpu()) { + assert(opr1->type() == T_LONG, "expect jlong type"); + assert(opr2->type() == T_LONG, "expect jlong type"); + Register xlo = opr1->as_register_lo(); + Register xhi = opr1->as_register_hi(); + if (opr2->is_double_cpu()) { + // cpu register - cpu register + Register ylo = opr2->as_register_lo(); + Register yhi = opr2->as_register_hi(); + switch (condition) { + case lir_cond_equal: + case lir_cond_notEqual: + case lir_cond_belowEqual: + case lir_cond_aboveEqual: + // these need APSR.ZC. the ops below set them correctly (but not APSR.V) + __ cmp(xhi, yhi); + __ cmp(xlo, ylo, Assembler::EQ); + break; + case lir_cond_less: + case lir_cond_greaterEqual: + __ cmp(xlo, ylo); + __ sbcs(rscratch1, xhi, yhi); + break; + case lir_cond_lessEqual: + case lir_cond_greater: + // here goes a trick: the below operations do not produce the valid + // value for the APSR.Z flag and there is no easy way to set it. so + // we exchange the order of arguments in the comparison and use the + // opposite condition in the conditional statement that follows. + // GE should be used instead of LE and LT in place of GT. + // the comp_op() could only be followed by: emit_opBranch(), cmove() and + // emit_assert(). these are patched to be aware of this trick + __ cmp(ylo, xlo); + __ sbcs(rscratch1, yhi, xhi); + break; + } + } else if (opr2->is_constant()) { + jlong y = opr2->as_jlong(); + assert(Assembler::operand_valid_for_add_sub_immediate(y), "immediate overflow"); + switch (condition) { + case lir_cond_equal: + case lir_cond_notEqual: + case lir_cond_belowEqual: + case lir_cond_aboveEqual: + __ cmp(xhi, (int)(y >> 32)); + __ cmp(xlo, (int)y, Assembler::EQ); + break; + case lir_cond_less: + case lir_cond_greaterEqual: + __ cmp(xlo, (int)y); + __ sbcs(rscratch1, xhi, (int)(y >> 32)); + break; + case lir_cond_lessEqual: + case lir_cond_greater: + __ rsbs(rscratch1, xlo, (int)y); + __ rscs(rscratch1, xhi, (int)(y >> 32)); + break; + } + } else { + ShouldNotReachHere(); + } + } else if (opr1->is_single_fpu()) { + FloatRegister reg1 = opr1->as_float_reg(); + assert(opr2->is_single_fpu(), "expect single float register"); + FloatRegister reg2 = opr2->as_float_reg(); + __ vcmp_f32(reg1, reg2); + __ get_fpsr(); + } else if (opr1->is_double_fpu()) { + FloatRegister reg1 = opr1->as_double_reg(); + assert(opr2->is_double_fpu(), "expect double float register"); + FloatRegister reg2 = opr2->as_double_reg(); + __ vcmp_f64(reg1, reg2); + __ get_fpsr(); + } else { + ShouldNotReachHere(); + } +} + +void LIR_Assembler::comp_fl2i(LIR_Code code, LIR_Opr left, LIR_Opr right, LIR_Opr dst, LIR_Op2* op){ + if (code == lir_cmp_fd2i || code == lir_ucmp_fd2i) { + bool is_unordered_less = (code == lir_ucmp_fd2i); + if (left->is_single_fpu()) { + __ float_cmp(true, is_unordered_less ? -1 : 1, left->as_float_reg(), right->as_float_reg(), dst->as_register()); + } else if (left->is_double_fpu()) { + __ float_cmp(false, is_unordered_less ? 
-1 : 1, left->as_double_reg(), right->as_double_reg(), dst->as_register()); + } else { + ShouldNotReachHere(); + } + } else if (code == lir_cmp_l2i) { + __ mov(dst->as_register(), 1); + __ subs(rscratch1, left->as_register_lo(), right->as_register_lo()); + __ sbc(rscratch2, left->as_register_hi(), right->as_register_hi()); + __ orrs(rscratch1, rscratch1, rscratch2); + __ mov(dst->as_register(), -1, Assembler::MI); + __ mov(dst->as_register(), 0, Assembler::EQ); + } else { + ShouldNotReachHere(); + } +} + + +void LIR_Assembler::align_call(LIR_Code code) { } + + +void LIR_Assembler::call(LIR_OpJavaCall* op, relocInfo::relocType rtype) { + __ trampoline_call(Address(op->addr(), rtype)); + add_call_info(code_offset(), op->info()); +} + + +void LIR_Assembler::ic_call(LIR_OpJavaCall* op) { + __ ic_call(op->addr()); + add_call_info(code_offset(), op->info()); +} + + +/* Currently, vtable-dispatch is only enabled for sparc platforms */ +void LIR_Assembler::vtable_call(LIR_OpJavaCall* op) { + ShouldNotReachHere(); +} + + +void LIR_Assembler::emit_static_call_stub() { + address call_pc = __ pc(); + address stub = __ start_a_stub(call_stub_size()); + if (stub == NULL) { + bailout("static call stub overflow"); + return; + } + + int start = __ offset(); + + __ relocate(static_stub_Relocation::spec(call_pc)); + __ mov_metadata(rmethod, (Metadata*)NULL); + __ movptr(rscratch1, 0); + __ b(rscratch1); + + assert(__ offset() - start <= call_stub_size(), "stub too big"); + __ end_a_stub(); +} + + +void LIR_Assembler::throw_op(LIR_Opr exceptionPC, LIR_Opr exceptionOop, CodeEmitInfo* info) { + assert(exceptionOop->as_register() == r0, "must match"); + assert(exceptionPC->as_register() == r3, "must match"); + + // exception object is not added to oop map by LinearScan + // (LinearScan assumes that no oops are in fixed registers) + info->add_register_oop(exceptionOop); + Runtime1::StubID unwind_id; + + // get current pc information + // pc is only needed if the method has an exception handler, the unwind code does not need it. + int pc_for_athrow_offset = __ offset(); + __ add(exceptionPC->as_register(), r15_pc, -8); + add_call_info(pc_for_athrow_offset, info); // for exception handler + + __ verify_not_null_oop(r0); + // search an exception handler (r0: exception oop, r3: throwing pc) + if (compilation()->has_fpu_code()) { + unwind_id = Runtime1::handle_exception_id; + } else { + unwind_id = Runtime1::handle_exception_nofpu_id; + } + __ far_call(RuntimeAddress(Runtime1::entry_for(unwind_id))); + + // FIXME: enough room for two byte trap ???? + __ nop(); +} + + +void LIR_Assembler::unwind_op(LIR_Opr exceptionOop) { + assert(exceptionOop->as_register() == r0, "must match"); + + __ b(_unwind_handler_entry); +} + + +void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, LIR_Opr count, LIR_Opr dest, LIR_Opr tmp) { + Register lreg = left->is_single_cpu() ? left->as_register() : left->as_register_lo(); + Register dreg = dest->is_single_cpu() ? 
dest->as_register() : dest->as_register_lo(); + + switch (left->type()) { + case T_INT: + case T_ADDRESS: + case T_OBJECT: + __ andr(rscratch1, count->as_register(), 0x1f); + switch (code) { + case lir_shl: __ lsl(dreg, lreg, rscratch1); break; + case lir_shr: __ asr(dreg, lreg, rscratch1); break; + case lir_ushr: __ lsr(dreg, lreg, rscratch1); break; + default: + ShouldNotReachHere(); + break; + } + break; + case T_LONG: + { + Register lreg_hi = left->as_register_hi(); + Register dreg_hi = dest->as_register_hi(); + const int word_bits = 8 * wordSize; + + if (code == lir_shl || code == lir_ushr) { + check_register_collision(dreg, &lreg, &lreg_hi, rscratch1); + check_register_collision(dreg_hi, &lreg, &lreg_hi, rscratch2); + } + + switch (code) { + case lir_shl: + __ andr(dreg, count->as_register(), 0x3f); + __ sub(dreg_hi, dreg, word_bits); + __ lsl(lreg_hi, lreg_hi, dreg); + __ orr(lreg_hi, lreg_hi, lreg, lsl(dreg_hi)); + __ rsb(dreg_hi, dreg, word_bits); + __ orr(dreg_hi, lreg_hi, lreg, lsr(dreg_hi)); + __ lsl(dreg, lreg, dreg); + break; + case lir_shr: { + __ mov(rscratch2, lreg_hi); + __ andr(rscratch1, count->as_register(), 0x3f); + __ lsr(dreg, lreg, rscratch1); + __ rsb(dreg_hi, rscratch1, word_bits); + __ orr(dreg, dreg, rscratch2, lsl(dreg_hi)); + __ asr(dreg_hi, rscratch2, rscratch1); + __ subs(rscratch1, rscratch1, word_bits); + __ mov(dreg, rscratch2, asr(rscratch1), Assembler::GT); + } + break; + case lir_ushr: + __ andr(dreg, count->as_register(), 0x3f); + __ lsr(lreg, lreg, dreg); + __ rsb(dreg_hi, dreg, word_bits); + __ orr(lreg, lreg, lreg_hi, lsl(dreg_hi)); + __ lsr(dreg_hi, lreg_hi, dreg); + __ sub(dreg, dreg, word_bits); + __ orr(dreg, lreg, lreg_hi, lsr(dreg)); + break; + default: + ShouldNotReachHere(); + break; + } + } + break; + default: + ShouldNotReachHere(); + break; + } +} + + +void LIR_Assembler::shift_op(LIR_Code code, LIR_Opr left, jint count, LIR_Opr dest) { + Register dreg = dest->is_single_cpu() ? dest->as_register() : dest->as_register_lo(); + Register lreg = left->is_single_cpu() ? 
left->as_register() : left->as_register_lo(); + + if (!count) { + reg2reg(left, dest); + return; + } + + switch (left->type()) { + case T_INT: + case T_ADDRESS: + case T_OBJECT: + switch (code) { + case lir_shl: __ lsl(dreg, lreg, count); break; + case lir_shr: __ asr(dreg, lreg, count); break; + case lir_ushr: __ lsr(dreg, lreg, count); break; + default: + ShouldNotReachHere(); + break; + } + break; + case T_LONG: { + Register lreg_hi = left->as_register_hi(); + Register dreg_hi = dest->as_register_hi(); + const int word_bits = 8 * wordSize; + + switch (code) { + case lir_shl: + if (count >= word_bits) { + __ lsl(dreg_hi, lreg, count - word_bits); + __ mov(dreg, 0); + } else { + check_register_collision(dreg_hi, &lreg); + __ lsl(dreg_hi, lreg_hi, count); + __ orr(dreg_hi, dreg_hi, lreg, lsr(word_bits - count)); + __ lsl(dreg, lreg, count); + } + break; + case lir_shr: + if (count >= word_bits) { + __ asr(dreg, lreg_hi, count - word_bits); + __ asr(dreg_hi, lreg_hi, word_bits); + } else { + check_register_collision(dreg, &lreg_hi); + __ lsr(dreg, lreg, count); + __ orr(dreg, dreg, lreg_hi, lsl(word_bits - count)); + __ asr(dreg_hi, lreg_hi, count); + } + break; + case lir_ushr: + if (count >= word_bits) { + __ lsr(dreg, lreg_hi, count - word_bits); + __ mov(dreg_hi, 0); + } else { + check_register_collision(dreg, &lreg_hi); + __ lsr(dreg, lreg, count); + __ orr(dreg, dreg, lreg_hi, lsl(word_bits - count)); + __ lsr(dreg_hi, lreg_hi, count); + } + break; + default: + ShouldNotReachHere(); + break; + } + } + break; + default: + ShouldNotReachHere(); + break; + } +} + + +void LIR_Assembler::store_parameter(Register r, int offset_from_sp_in_words) { + assert(offset_from_sp_in_words >= 0, "invalid offset from sp"); + int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord; + assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "invalid offset"); + __ str (r, Address(sp, offset_from_sp_in_bytes)); +} + + +void LIR_Assembler::store_parameter(jint c, int offset_from_sp_in_words) { + assert(offset_from_sp_in_words >= 0, "invalid offset from sp"); + int offset_from_sp_in_bytes = offset_from_sp_in_words * BytesPerWord; + assert(offset_from_sp_in_bytes < frame_map()->reserved_argument_area_size(), "invalid offset"); + __ mov (rscratch1, c); + __ str (rscratch1, Address(sp, offset_from_sp_in_bytes)); +} + +// This code replaces a call to arraycopy; no exception may +// be thrown in this code, they must be thrown in the System.arraycopy +// activation frame; we could save some checks if this would not be the case +void LIR_Assembler::emit_arraycopy(LIR_OpArrayCopy* op) { + ciArrayKlass* default_type = op->expected_type(); + Register src = op->src()->as_register(); + Register dst = op->dst()->as_register(); + Register src_pos = op->src_pos()->as_register(); + Register dst_pos = op->dst_pos()->as_register(); + Register length = op->length()->as_register(); + Register tmp = op->tmp()->as_register(); + // due to limited number of registers available and in order to simplify + // the code we fix the registers used by the arguments to this intrinsic. 
+ // see the comment in LIRGenerator::do_ArrayCopy + assert(src == j_rarg0, "assumed by implementation"); + assert(src_pos == j_rarg1, "assumed by implementation"); + assert(dst == j_rarg2, "assumed by implementation"); + assert(dst_pos == j_rarg3, "assumed by implementation"); + assert(length == r4, "assumed by implementation"); + assert(tmp == r5, "assumed by implementation"); + const int dst_spill_offset = 2*BytesPerWord; + const int dst_pos_spill_offset = 1*BytesPerWord; + const int length_spill_offset = 0*BytesPerWord; + const int src_pos_spill_offset = 3*BytesPerWord; + const int src_spill_offset = 4*BytesPerWord; + + CodeStub* stub = op->stub(); + int flags = op->flags(); + BasicType basic_type = default_type != NULL ? default_type->element_type()->basic_type() : T_ILLEGAL; + if (basic_type == T_ARRAY) basic_type = T_OBJECT; + + // if we don't know anything, just go through the generic arraycopy + if (default_type == NULL // || basic_type == T_OBJECT + ) { + Label done; + assert(src == r0 && src_pos == r1, "mismatch in calling convention"); + + // Save the arguments in case the generic arraycopy fails and we + // have to fall back to the JNI stub. spill all but length since it's + // in the callee save register + __ str(dst, Address(sp, dst_spill_offset)); + __ str(dst_pos, Address(sp, dst_pos_spill_offset)); + __ str(src_pos, Address(sp, src_pos_spill_offset)); + __ str(src, Address(sp, src_spill_offset)); + + address copyfunc_addr = StubRoutines::generic_arraycopy(); + assert(copyfunc_addr != NULL, "generic arraycopy stub required"); + + // The arguments are in java calling convention so we shift them + // to C convention + assert(c_rarg0 == j_rarg0, "assumed in the code below"); + // the below C function follows C calling convention, + // so should put 5th arg to stack + assert(length_spill_offset == 0, "assumed in the code below"); + __ str(length, Address(sp)); + +#ifndef PRODUCT + if (PrintC1Statistics) { + __ increment(ExternalAddress((address)&Runtime1::_generic_arraycopystub_cnt)); + } +#endif + __ far_call(RuntimeAddress(copyfunc_addr)); + + __ cbz(r0, *stub->continuation()); + + // r0 is -1^K where K == partial copied count + __ inv(rscratch1, r0); + + // Reload values from the stack so they are where the stub + // expects them. 
don't reload length since it's in the callee-save register + // and the value on the stack might have been modified by the C function + __ ldr(dst, Address(sp, dst_spill_offset)); + __ ldr(dst_pos, Address(sp, dst_pos_spill_offset)); + __ ldr(src_pos, Address(sp, src_pos_spill_offset)); + __ ldr(src, Address(sp, src_spill_offset)); + + // adjust length down and src/end pos up by partial copied count + __ sub(length, length, rscratch1); + __ add(src_pos, src_pos, rscratch1); + __ add(dst_pos, dst_pos, rscratch1); + + __ b(*stub->entry()); + + __ bind(*stub->continuation()); + return; + } + + assert(default_type != NULL && default_type->is_array_klass() && default_type->is_loaded(), "must be true at this point"); + + int elem_size = type2aelembytes(basic_type); + int scale = exact_log2(elem_size); + + Address src_length_addr = Address(src, arrayOopDesc::length_offset_in_bytes()); + Address dst_length_addr = Address(dst, arrayOopDesc::length_offset_in_bytes()); + Address src_klass_addr = Address(src, oopDesc::klass_offset_in_bytes()); + Address dst_klass_addr = Address(dst, oopDesc::klass_offset_in_bytes()); + + // test for NULL + if (flags & LIR_OpArrayCopy::src_null_check) { + __ cbz(src, *stub->entry()); + } + if (flags & LIR_OpArrayCopy::dst_null_check) { + __ cbz(dst, *stub->entry()); + } + + // If the compiler was not able to prove that exact type of the source or the destination + // of the arraycopy is an array type, check at runtime if the source or the destination is + // an instance type. + if (flags & LIR_OpArrayCopy::type_check) { + if (!(flags & LIR_OpArrayCopy::LIR_OpArrayCopy::dst_objarray)) { + __ load_klass(tmp, dst); + __ ldr(rscratch1, Address(tmp, in_bytes(Klass::layout_helper_offset()))); + __ cmp(rscratch1, Klass::_lh_neutral_value); + __ b(*stub->entry(), Assembler::GE); + } + + if (!(flags & LIR_OpArrayCopy::LIR_OpArrayCopy::src_objarray)) { + __ load_klass(tmp, src); + __ ldr(rscratch1, Address(tmp, in_bytes(Klass::layout_helper_offset()))); + __ cmp(rscratch1, Klass::_lh_neutral_value); + __ b(*stub->entry(), Assembler::GE); + } + } + + + // check if negative + if (flags & LIR_OpArrayCopy::src_pos_positive_check) { + __ cmp(src_pos, 0); + __ b(*stub->entry(), Assembler::LT); + } + if (flags & LIR_OpArrayCopy::dst_pos_positive_check) { + __ cmp(dst_pos, 0); + __ b(*stub->entry(), Assembler::LT); + } + + if (flags & LIR_OpArrayCopy::length_positive_check) { + __ cmp(length, 0); + __ b(*stub->entry(), Assembler::LT); + } + + if (flags & LIR_OpArrayCopy::src_range_check) { + __ add(tmp, src_pos, length); + __ ldr(rscratch1, src_length_addr); + __ cmp(tmp, rscratch1); + __ b(*stub->entry(), Assembler::HI); + } + if (flags & LIR_OpArrayCopy::dst_range_check) { + __ add(tmp, dst_pos, length); + __ ldr(rscratch1, dst_length_addr); + __ cmp(tmp, rscratch1); + __ b(*stub->entry(), Assembler::HI); + } + + // FIXME: The logic in LIRGenerator::arraycopy_helper clears + // length_positive_check if the source of our length operand is an + // arraylength. However, that arraylength might be zero, and the + // stub that we're about to call contains an assertion that count != + // 0 . So we make this check purely in order not to trigger an + // assertion failure. 
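+ // A zero-length copy is a no-op, so branch straight to the stub continuation (the end of this arraycopy).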
+ __ cbz(length, *stub->continuation()); + + if (flags & LIR_OpArrayCopy::type_check) { + // We don't know the array types are compatible + if (basic_type != T_OBJECT) { + // Simple test for basic type arrays + __ ldr(tmp, src_klass_addr); + __ ldr(rscratch1, dst_klass_addr); + __ cmp(tmp, rscratch1); + __ b(*stub->entry(), Assembler::NE); + } else { + // For object arrays, if src is a sub class of dst then we can + // safely do the copy. + Label cont, slow; + + __ push(RegSet::of(src, dst), sp); + + __ load_klass(src, src); + __ load_klass(dst, dst); + + __ check_klass_subtype_fast_path(src, dst, tmp, &cont, &slow, NULL); + + __ push(src); // sub + __ push(dst); // super + __ far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::slow_subtype_check_id))); + // result on TOS + __ pop(src); // result + __ pop(dst); + + __ cbnz(src, cont); + + __ bind(slow); + __ pop(RegSet::of(src, dst), sp); + + address copyfunc_addr = StubRoutines::checkcast_arraycopy(); + if (copyfunc_addr != NULL) { // use stub if available + // src is not a sub class of dst so we have to do a + // per-element check. + + int mask = LIR_OpArrayCopy::src_objarray|LIR_OpArrayCopy::dst_objarray; + if ((flags & mask) != mask) { + // Check that both of them are object arrays. + assert(flags & mask, "one of the two should be known to be an object array"); + + if (!(flags & LIR_OpArrayCopy::src_objarray)) { + __ load_klass(tmp, src); + } else if (!(flags & LIR_OpArrayCopy::dst_objarray)) { + __ load_klass(tmp, dst); + } + int lh_offset = in_bytes(Klass::layout_helper_offset()); + Address klass_lh_addr(tmp, lh_offset); + jint objArray_lh = Klass::array_layout_helper(T_OBJECT); + __ ldr(rscratch1, klass_lh_addr); + __ mov(rscratch2, objArray_lh); + __ eor(rscratch1, rscratch1, rscratch2); + __ cbnz(rscratch1, *stub->entry()); + } + + // Spill because stub destroys r0-r3. 
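+ // (length itself stays in r4, outside the r0-r3 registers the stub destroys, so it is not spilled here; see the asserts below)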
+ assert(length_spill_offset == 0, "assumed in the code below"); + assert(length == r4, "shall not be in r0-r3"); + __ str(dst_pos, Address(sp, dst_pos_spill_offset)); + __ str(dst, Address(sp, dst_spill_offset)); + __ str(src_pos, Address(sp, src_pos_spill_offset)); + __ str(src, Address(sp, src_spill_offset)); + + __ lea(c_rarg0, Address(src, src_pos, lsl(scale))); + __ add(c_rarg0, c_rarg0, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg0, dst, dst_pos, length); + __ lea(c_rarg1, Address(dst, dst_pos, lsl(scale))); + __ add(c_rarg1, c_rarg1, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg1, dst, dst_pos, length); + + __ load_klass(c_rarg2, dst); + __ ldr(c_rarg2, Address(c_rarg2, ObjArrayKlass::element_klass_offset())); + __ ldr(c_rarg3, Address(c_rarg2, Klass::super_check_offset_offset())); + __ str(c_rarg2, Address(sp)); + __ mov(c_rarg2, length); + __ far_call(RuntimeAddress(copyfunc_addr)); + +#ifndef PRODUCT + if (PrintC1Statistics) { + Label failed; + __ cbnz(r0, failed); + __ increment(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_cnt)); + __ bind(failed); + } +#endif + + __ cbz(r0, *stub->continuation()); + +#ifndef PRODUCT + if (PrintC1Statistics) { + __ increment(ExternalAddress((address)&Runtime1::_arraycopy_checkcast_attempt_cnt)); + } +#endif + assert_different_registers(dst, dst_pos, length, src_pos, src, rscratch1); + // return value is -1^K where K is partial copied count + __ mvn(rscratch1, r0); + + // Restore previously spilled arguments + __ ldr(dst_pos, Address(sp, dst_pos_spill_offset)); + __ ldr(dst, Address(sp, dst_spill_offset)); + __ ldr(src_pos, Address(sp, src_pos_spill_offset)); + __ ldr(src, Address(sp, src_spill_offset)); + + // adjust length down and src/end pos up by partial copied count + __ sub(length, length, rscratch1); + __ add(src_pos, src_pos, rscratch1); + __ add(dst_pos, dst_pos, rscratch1); + } + + __ b(*stub->entry()); + + __ bind(cont); + __ pop(RegSet::of(src, dst), sp); + } + } + +#ifdef ASSERT + if (basic_type != T_OBJECT || !(flags & LIR_OpArrayCopy::type_check)) { + // Sanity check the known type with the incoming class. For the + // primitive case the types must match exactly with src.klass and + // dst.klass each exactly matching the default type. For the + // object array case, if no type check is needed then either the + // dst type is exactly the expected type and the src type is a + // subtype which we can't check or src is the same array as dst + // but not necessarily exactly of type default_type. 
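+ // The assembly below mirrors this reasoning: it only falls into 'halt' when the observed klasses are inconsistent with default_type.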
+ Label known_ok, halt; + __ mov_metadata(tmp, default_type->constant_encoding()); + + if (basic_type != T_OBJECT) { + + __ ldr(rscratch1, dst_klass_addr); + __ cmp(tmp, rscratch1); + __ b(halt, Assembler::NE); + __ ldr(rscratch1, src_klass_addr); + __ cmp(tmp, rscratch1); + __ b(known_ok, Assembler::EQ); + } else { + __ ldr(rscratch1, dst_klass_addr); + __ cmp(tmp, rscratch1); + __ b(known_ok, Assembler::EQ); + __ cmp(src, dst); + __ b(known_ok, Assembler::EQ); + } + __ bind(halt); + __ stop("incorrect type information in arraycopy"); + __ bind(known_ok); + } +#endif + + __ lea(c_rarg0, Address(src, src_pos, lsl(scale))); + __ add(c_rarg0, c_rarg0, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg0, dst, dst_pos, length); + __ lea(c_rarg1, Address(dst, dst_pos, lsl(scale))); + __ add(c_rarg1, c_rarg1, arrayOopDesc::base_offset_in_bytes(basic_type)); + assert_different_registers(c_rarg1, dst, dst_pos, length); + __ mov(c_rarg2, length); + + bool disjoint = (flags & LIR_OpArrayCopy::overlapping) == 0; + bool aligned = (flags & LIR_OpArrayCopy::unaligned) == 0; + const char *name; + address entry = StubRoutines::select_arraycopy_function(basic_type, aligned, disjoint, name, false); + + CodeBlob *cb = CodeCache::find_blob(entry); + if (cb) { + __ far_call(RuntimeAddress(entry)); + } else { + __ call_VM_leaf(entry, 3); + } + + __ bind(*stub->continuation()); +} + +void LIR_Assembler::emit_lock(LIR_OpLock* op) { + Register obj = op->obj_opr()->as_register(); // may not be an oop + Register hdr = op->hdr_opr()->as_register(); + Register lock = op->lock_opr()->as_register(); + if (!UseFastLocking) { + __ b(*op->stub()->entry()); + } else if (op->code() == lir_lock) { + Register scratch = noreg; + if (UseBiasedLocking) { + scratch = op->scratch_opr()->as_register(); + } + assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header"); + // add debug info for NullPointerException only if one is possible + int null_check_offset = __ lock_object(hdr, obj, lock, scratch, *op->stub()->entry()); + if (op->info() != NULL) { + add_debug_info_for_null_check(null_check_offset, op->info()); + } + // done + } else if (op->code() == lir_unlock) { + assert(BasicLock::displaced_header_offset_in_bytes() == 0, "lock_reg must point to the displaced header"); + __ unlock_object(hdr, obj, lock, *op->stub()->entry()); + } else { + Unimplemented(); + } + __ bind(*op->stub()->continuation()); +} + + +void LIR_Assembler::emit_profile_call(LIR_OpProfileCall* op) { + ciMethod* method = op->profiled_method(); + int bci = op->profiled_bci(); + ciMethod* callee = op->profiled_callee(); + + // Update counter for all call types + ciMethodData* md = method->method_data_or_null(); + assert(md != NULL, "Sanity"); + ciProfileData* data = md->bci_to_data(bci); + assert(data != NULL && data->is_CounterData(), "need CounterData for calls"); + assert(op->mdo()->is_single_cpu(), "mdo must be allocated"); + Register mdo = op->mdo()->as_register(); + __ mov_metadata(mdo, md->constant_encoding()); + Address counter_addr(mdo, md->byte_offset_of_slot(data, CounterData::count_offset())); + // Perform additional virtual call profiling for invokevirtual and + // invokeinterface bytecodes + if (op->should_profile_receiver_type()) { + assert(op->recv()->is_single_cpu(), "recv must be allocated"); + Register recv = op->recv()->as_register(); + assert_different_registers(mdo, recv); + assert(data->is_VirtualCallData(), "need VirtualCallData for virtual calls"); + ciKlass* 
known_klass = op->known_holder(); + if (C1OptimizeVirtualCallProfiling && known_klass != NULL) { + // We know the type that will be seen at this call site; we can + // statically update the MethodData* rather than needing to do + // dynamic tests on the receiver type + + // NOTE: we should probably put a lock around this search to + // avoid collisions by concurrent compilations + ciVirtualCallData* vc_data = (ciVirtualCallData*) data; + uint i; + for (i = 0; i < VirtualCallData::row_limit(); i++) { + ciKlass* receiver = vc_data->receiver(i); + if (known_klass->equals(receiver)) { + Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); + __ addptr(data_addr, DataLayout::counter_increment); + return; + } + } + + // Receiver type not found in profile data; select an empty slot + + // Note that this is less efficient than it should be because it + // always does a write to the receiver part of the + // VirtualCallData rather than just the first time + for (i = 0; i < VirtualCallData::row_limit(); i++) { + ciKlass* receiver = vc_data->receiver(i); + if (receiver == NULL) { + Address recv_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_offset(i))); + __ mov_metadata(rscratch1, known_klass->constant_encoding()); + __ lea(rscratch2, recv_addr); + __ str(rscratch1, Address(rscratch2)); + Address data_addr(mdo, md->byte_offset_of_slot(data, VirtualCallData::receiver_count_offset(i))); + __ addptr(data_addr, DataLayout::counter_increment); + return; + } + } + } else { + __ load_klass(recv, recv); + Label update_done; + type_profile_helper(mdo, md, data, recv, &update_done); + // Receiver did not match any saved receiver and there is no empty row for it. + // Increment total counter to indicate polymorphic case. 
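+ // (counter_addr was set up above from CounterData::count_offset(); it counts every call seen at this bci)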
+ __ addptr(counter_addr, DataLayout::counter_increment); + + __ bind(update_done); + } + } else { + // Static call + __ addptr(counter_addr, DataLayout::counter_increment); + } +} + + +void LIR_Assembler::emit_delay(LIR_OpDelay*) { + Unimplemented(); +} + + +void LIR_Assembler::monitor_address(int monitor_no, LIR_Opr dst) { + __ lea(dst->as_register(), frame_map()->address_for_monitor_lock(monitor_no)); +} + +void LIR_Assembler::emit_updatecrc32(LIR_OpUpdateCRC32* op) { + assert(op->crc()->is_single_cpu(), "crc must be register"); + assert(op->val()->is_single_cpu(), "byte value must be register"); + assert(op->result_opr()->is_single_cpu(), "result must be register"); + Register crc = op->crc()->as_register(); + Register val = op->val()->as_register(); + Register res = op->result_opr()->as_register(); + + assert_different_registers(val, crc, res); + __ lea(res, ExternalAddress(StubRoutines::crc_table_addr())); + + __ inv(crc, crc); + __ update_byte_crc32(crc, val, res); + __ inv(res, crc); +} + +void LIR_Assembler::emit_profile_type(LIR_OpProfileType* op) { + COMMENT("emit_profile_type {"); + Register obj = op->obj()->as_register(); + Register tmp = op->tmp()->as_pointer_register(); + Address mdo_addr = as_Address(op->mdp()->as_address_ptr(), noreg, Address::IDT_INT); + ciKlass* exact_klass = op->exact_klass(); + intptr_t current_klass = op->current_klass(); + bool not_null = op->not_null(); + bool no_conflict = op->no_conflict(); + + Label update, next, none; + + bool do_null = !not_null; + bool exact_klass_set = exact_klass != NULL && ciTypeEntries::valid_ciklass(current_klass) == exact_klass; + bool do_update = !TypeEntries::is_type_unknown(current_klass) && !exact_klass_set; + + assert(do_null || do_update, "why are we here?"); + assert(!TypeEntries::was_null_seen(current_klass) || do_update, "why are we here?"); + assert(mdo_addr.base() != rscratch1, "wrong register"); + + __ verify_oop(obj); + + if (tmp != obj) { + __ mov(tmp, obj); + } + if (do_null) { + __ cbnz(tmp, update); + if (!TypeEntries::was_null_seen(current_klass)) { + __ ldr(rscratch2, mdo_addr); + __ orr(rscratch2, rscratch2, TypeEntries::null_seen); + __ str(rscratch2, mdo_addr); + } + if (do_update) { +#ifndef ASSERT + __ b(next); + } +#else + __ b(next); + } + } else { + __ cbnz(tmp, update); + __ stop("unexpected null obj"); +#endif + } + + __ bind(update); + + if (do_update) { +#ifdef ASSERT + if (exact_klass != NULL) { + Label ok; + __ load_klass(tmp, tmp); + __ mov_metadata(rscratch1, exact_klass->constant_encoding()); + __ eor(rscratch1, tmp, rscratch1); + __ cbz(rscratch1, ok); + __ stop("exact klass and actual klass differ"); + __ bind(ok); + } +#endif + if (!no_conflict) { + if (exact_klass == NULL || TypeEntries::is_type_none(current_klass)) { + if (exact_klass != NULL) { + __ mov_metadata(tmp, exact_klass->constant_encoding()); + } else { + __ load_klass(tmp, tmp); + } + + __ ldr(rscratch2, mdo_addr); + __ eor(tmp, tmp, rscratch2); + __ bics(rscratch1, tmp, ~TypeEntries::type_klass_mask); + // klass seen before, nothing to do. The unknown bit may have been + // set already but no need to check. + __ b(next, Assembler::EQ); + + __ ands(rscratch1, tmp, TypeEntries::type_unknown); + __ b(next, Assembler::NE); // already unknown. Nothing to do anymore. 
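+ // The loaded klass differs from the recorded entry and the unknown bit is clear: either record it as the first type seen here (at 'none' below) or downgrade the entry to type_unknown.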
+ + if (TypeEntries::is_type_none(current_klass)) { + __ cbz(rscratch2, none); + __ cmp(rscratch2, TypeEntries::null_seen); + __ b(none, Assembler::EQ); + // There is a chance that the checks above (re-reading profiling + // data from memory) fail if another thread has just set the + // profiling to this obj's klass + __ dmb(Assembler::ISH); + __ ldr(rscratch2, mdo_addr); + __ eor(tmp, tmp, rscratch2); + __ bics(rscratch1, tmp, ~TypeEntries::type_klass_mask); + __ b(next, Assembler::EQ); + } + } else { + assert(ciTypeEntries::valid_ciklass(current_klass) != NULL && + ciTypeEntries::valid_ciklass(current_klass) != exact_klass, "conflict only"); + + __ ldr(tmp, mdo_addr); + __ ands(rscratch1, tmp, TypeEntries::type_unknown); + __ b(next, Assembler::NE); // already unknown. Nothing to do anymore. + } + + // different than before. Cannot keep accurate profile. + __ ldr(rscratch2, mdo_addr); + __ orr(rscratch2, rscratch2, TypeEntries::type_unknown); + __ str(rscratch2, mdo_addr); + + if (TypeEntries::is_type_none(current_klass)) { + __ b(next); + + __ bind(none); + // first time here. Set profile type. + __ str(tmp, mdo_addr); + } + } else { + // There's a single possible klass at this profile point + assert(exact_klass != NULL, "should be"); + if (TypeEntries::is_type_none(current_klass)) { + __ mov_metadata(tmp, exact_klass->constant_encoding()); + __ ldr(rscratch2, mdo_addr); + __ eor(tmp, tmp, rscratch2); + __ bics(rscratch1, tmp, ~TypeEntries::type_klass_mask); + __ b(next, Assembler::EQ); +#ifdef ASSERT + { + Label ok; + __ ldr(rscratch1, mdo_addr); + __ cbz(rscratch1, ok); + __ cmp(rscratch1, TypeEntries::null_seen); + __ b(ok, Assembler::EQ); + // may have been set by another thread + __ dmb(Assembler::ISH); + __ mov_metadata(rscratch1, exact_klass->constant_encoding()); + __ ldr(rscratch2, mdo_addr); + __ eor(rscratch2, rscratch1, rscratch2); + __ bics(rscratch2, rscratch2, ~TypeEntries::type_mask); + __ b(ok, Assembler::EQ); + + __ stop("unexpected profiling mismatch"); + __ bind(ok); + } +#endif + // first time here. Set profile type. + __ ldr(tmp, mdo_addr); + } else { + assert(ciTypeEntries::valid_ciklass(current_klass) != NULL && + ciTypeEntries::valid_ciklass(current_klass) != exact_klass, "inconsistent"); + + __ ldr(tmp, mdo_addr); + __ ands(rscratch1, tmp, TypeEntries::type_unknown); + __ b(next, Assembler::NE); // already unknown. Nothing to do anymore. + + __ orr(tmp, tmp, TypeEntries::type_unknown); + __ str(tmp, mdo_addr); + // FIXME: Write barrier needed here? 
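(For orientation: the eor/bics sequences above compare a freshly loaded klass against the profile word while ignoring the low tag bits. A minimal standalone sketch of that tagged-word scheme, using illustrative bit values rather than the actual TypeEntries constants:)

    #include <cassert>
    #include <cstdint>

    static const uintptr_t null_seen       = 1;        // illustrative tag bits, not the
    static const uintptr_t type_unknown    = 2;        // real TypeEntries values
    static const uintptr_t tag_bits        = null_seen | type_unknown;
    static const uintptr_t type_klass_mask = ~tag_bits; // upper bits hold the klass pointer

    // Mirrors the eor + bics pattern: any difference outside the tag bits
    // means a different klass was recorded in the profile word.
    static bool same_klass(uintptr_t profile_word, uintptr_t klass) {
      return ((profile_word ^ klass) & type_klass_mask) == 0;
    }

    int main() {
      alignas(8) static int dummy_klass;               // stands in for a Klass*
      uintptr_t k = reinterpret_cast<uintptr_t>(&dummy_klass);
      uintptr_t word = k | null_seen;                  // klass recorded, a null was also seen
      assert(same_klass(word, k));
      assert((word & type_unknown) == 0);              // profile still has a precise type
      word |= type_unknown;                            // conflict: give up on the exact type
      assert(same_klass(word, k) && (word & type_unknown) != 0); // klass bits untouched
      return 0;
    }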
+ } + } + + __ bind(next); + } + COMMENT("} emit_profile_type"); +} + + +void LIR_Assembler::align_backward_branch_target() { +} + + +void LIR_Assembler::negate(LIR_Opr left, LIR_Opr dest) { + if (left->is_single_cpu()) { + assert(left->type() != T_FLOAT, "expect integer type"); + assert(dest->type() != T_FLOAT, "expect integer type"); + assert(dest->is_single_cpu(), "expect single result reg"); + __ neg(dest->as_register(), left->as_register()); + } else if (left->is_double_cpu()) { + assert(left->type() != T_DOUBLE, "expect integer type"); + assert(dest->type() != T_DOUBLE, "expect integer type"); + assert(dest->is_double_cpu(), "expect double result reg"); + const Register l_lo = left->as_register_lo(); + Register l_hi = left->as_register_hi(); + check_register_collision(dest->as_register_lo(), &l_hi); + __ rsbs(dest->as_register_lo(), l_lo, 0); + __ rsc(dest->as_register_hi(), l_hi, 0); + } else if (left->is_single_fpu()) { + assert(dest->is_single_fpu(), "expect single float result reg"); + __ vneg_f32(dest->as_float_reg(), left->as_float_reg()); + } else if (left->is_double_fpu()) { + assert(left->is_double_fpu(), "expect double float operand reg"); + assert(dest->is_double_fpu(), "expect double float result reg"); + __ vneg_f64(dest->as_double_reg(), left->as_double_reg()); + } else { + ShouldNotReachHere(); + } +} + + +void LIR_Assembler::leal(LIR_Opr addr, LIR_Opr dest, LIR_PatchCode patch_code, CodeEmitInfo* info) { + assert(patch_code == lir_patch_none, "Patch code not supported"); + __ lea(dest->as_register(), as_Address(addr->as_address_ptr(), noreg, Address::IDT_LEA)); +} + + +void LIR_Assembler::rt_call(LIR_Opr result, address dest, const LIR_OprList* args, LIR_Opr tmp, CodeEmitInfo* info) { + assert(!tmp->is_valid(), "don't need temporary"); + CodeBlob *cb = CodeCache::find_blob(dest); + if (cb) { + __ far_call(RuntimeAddress(dest)); + } else { + __ lea(rscratch1, RuntimeAddress(dest)); + __ bl(rscratch1); + } + if (info != NULL) { + add_call_info_here(info); + } + __ maybe_isb(); +} + +void LIR_Assembler::volatile_move_op(LIR_Opr src, LIR_Opr dest, BasicType type, CodeEmitInfo* info) { + if (type == T_LONG || type == T_DOUBLE) { + const LIR_Opr long_val = FrameMap::long0_opr; + + int null_check_offset = -1; + + if (src->is_register() && dest->is_address()) { + // long1 reserved as temp by LinearScan::pd_add_temps + const LIR_Opr long_tmp = FrameMap::long1_opr; + __ lea(rscratch1, as_Address_lo(dest->as_address_ptr(), Address::IDT_LEA)); + + + if (src->is_double_fpu()) { + assert(type == T_DOUBLE, "invalid register allocation"); + // long0 reserved as temp by LinearScan::pd_add_temps + __ vmov_f64(long_val->as_register_lo(), long_val->as_register_hi(), src->as_double_reg()); + } else { + assert(type == T_LONG && src->is_same_register(long_val), "T_LONG src should be in long0 (by LIRGenerator)"); + } + + null_check_offset = __ offset(); + __ atomic_strd(long_val->as_register_lo(), long_val->as_register_hi(), rscratch1, + long_tmp->as_register_lo(), long_tmp->as_register_hi()); + + } else if (src->is_address() && dest->is_register()) { + __ lea(rscratch1, as_Address_lo(src->as_address_ptr(), Address::IDT_LEA)); + + null_check_offset = __ offset(); + __ atomic_ldrd(long_val->as_register_lo(), long_val->as_register_hi(), rscratch1); + + if (dest->is_double_fpu()) { + __ vmov_f64(dest->as_double_reg(), long_val->as_register_lo(), long_val->as_register_hi()); + } else { + assert(type != T_LONG || dest->is_same_register(long_val), "T_LONG dest should be in long0 (by 
LIRGenerator)"); + } + } else { + Unimplemented(); + } + + if (info != NULL) { + add_debug_info_for_null_check(null_check_offset, info); + } + + } else { + move_op(src, dest, type, lir_patch_none, info, + /*pop_fpu_stack*/false, /*unaligned*/false, /*wide*/false); + } +} + +#ifdef ASSERT +// emit run-time assertion +void LIR_Assembler::emit_assert(LIR_OpAssert* op) { + assert(op->code() == lir_assert, "must be"); + + if (op->in_opr1()->is_valid()) { + assert(op->in_opr2()->is_valid(), "both operands must be valid"); + comp_op(op->condition(), op->in_opr1(), op->in_opr2(), op); + } else { + assert(op->in_opr2()->is_illegal(), "both operands must be illegal"); + assert(op->condition() == lir_cond_always, "no other conditions allowed"); + } + + Label ok; + if (op->condition() != lir_cond_always) { + Assembler::Condition acond = Assembler::AL; + switch (op->condition()) { + case lir_cond_equal: acond = Assembler::EQ; break; + case lir_cond_notEqual: acond = Assembler::NE; break; + case lir_cond_less: acond = Assembler::LT; break; + case lir_cond_greaterEqual: acond = Assembler::GE; break; + case lir_cond_lessEqual: acond = Assembler::LE; break; + case lir_cond_greater: acond = Assembler::GT; break; + case lir_cond_belowEqual: acond = Assembler::LS; break; + case lir_cond_aboveEqual: acond = Assembler::HS; break; + default: ShouldNotReachHere(); + } + if (op->in_opr1()->type() == T_LONG) { + // a special trick here to be able to effectively compare jlongs + // for the lessEqual and greater conditions the jlong operands are swapped + // during comparison and hence should use mirror condition in conditional + // instruction + // see LIR_Assembler::comp_op and LIR_Assembler::cmove + switch (op->condition()) { + case lir_cond_lessEqual: acond = Assembler::GE; break; + case lir_cond_greater: acond = Assembler::LT; break; + } + } + __ b(ok, acond); + } + if (op->halt()) { + const char* str = __ code_string(op->msg()); + __ stop(str); + } else { + breakpoint(); + } + __ bind(ok); +} +#endif + +#ifndef PRODUCT +#define COMMENT(x) do { __ block_comment(x); } while (0) +#else +#define COMMENT(x) +#endif + +void LIR_Assembler::membar() { + COMMENT("membar"); + __ membar(MacroAssembler::AnyAny); +} + +void LIR_Assembler::membar_acquire() { + __ membar(Assembler::LoadLoad|Assembler::LoadStore); +} + +void LIR_Assembler::membar_release() { + __ membar(Assembler::LoadStore|Assembler::StoreStore); +} + +void LIR_Assembler::membar_loadload() { + __ membar(Assembler::LoadLoad); +} + +void LIR_Assembler::membar_storestore() { + __ membar(MacroAssembler::StoreStore); +} + +void LIR_Assembler::membar_loadstore() { __ membar(MacroAssembler::LoadStore); } + +void LIR_Assembler::membar_storeload() { __ membar(MacroAssembler::StoreLoad); } + +void LIR_Assembler::on_spin_wait() { + Unimplemented(); +} + +void LIR_Assembler::get_thread(LIR_Opr result_reg) { + __ mov(result_reg->as_register(), rthread); +} + + +void LIR_Assembler::peephole(LIR_List *lir) { +#if 0 + if (tableswitch_count >= max_tableswitches) + return; + + /* + This finite-state automaton recognizes sequences of compare-and- + branch instructions. We will turn them into a tableswitch. You + could argue that C1 really shouldn't be doing this sort of + optimization, but without it the code is really horrible. 
+ */ + + enum { start_s, cmp1_s, beq_s, cmp_s } state; + int first_key, last_key = -2147483648; + int next_key = 0; + int start_insn = -1; + int last_insn = -1; + Register reg = noreg; + LIR_Opr reg_opr; + state = start_s; + + LIR_OpList* inst = lir->instructions_list(); + for (int i = 0; i < inst->length(); i++) { + LIR_Op* op = inst->at(i); + switch (state) { + case start_s: + first_key = -1; + start_insn = i; + switch (op->code()) { + case lir_cmp: + LIR_Opr opr1 = op->as_Op2()->in_opr1(); + LIR_Opr opr2 = op->as_Op2()->in_opr2(); + if (opr1->is_cpu_register() && opr1->is_single_cpu() + && opr2->is_constant() + && opr2->type() == T_INT) { + reg_opr = opr1; + reg = opr1->as_register(); + first_key = opr2->as_constant_ptr()->as_jint(); + next_key = first_key + 1; + state = cmp_s; + goto next_state; + } + break; + } + break; + case cmp_s: + switch (op->code()) { + case lir_branch: + if (op->as_OpBranch()->cond() == lir_cond_equal) { + state = beq_s; + last_insn = i; + goto next_state; + } + } + state = start_s; + break; + case beq_s: + switch (op->code()) { + case lir_cmp: { + LIR_Opr opr1 = op->as_Op2()->in_opr1(); + LIR_Opr opr2 = op->as_Op2()->in_opr2(); + if (opr1->is_cpu_register() && opr1->is_single_cpu() + && opr1->as_register() == reg + && opr2->is_constant() + && opr2->type() == T_INT + && opr2->as_constant_ptr()->as_jint() == next_key) { + last_key = next_key; + next_key++; + state = cmp_s; + goto next_state; + } + } + } + last_key = next_key; + state = start_s; + break; + default: + assert(false, "impossible state"); + } + if (state == start_s) { + if (first_key < last_key - 5L && reg != noreg) { + { + // printf("found run register %d starting at insn %d low value %d high value %d\n", + // reg->encoding(), + // start_insn, first_key, last_key); + // for (int i = 0; i < inst->length(); i++) { + // inst->at(i)->print(); + // tty->print("\n"); + // } + // tty->print("\n"); + } + + struct tableswitch *sw = &switches[tableswitch_count]; + sw->_insn_index = start_insn, sw->_first_key = first_key, + sw->_last_key = last_key, sw->_reg = reg; + inst->insert_before(last_insn + 1, new LIR_OpLabel(&sw->_after)); + { + // Insert the new table of branches + int offset = last_insn; + for (int n = first_key; n < last_key; n++) { + inst->insert_before + (last_insn + 1, + new LIR_OpBranch(lir_cond_always, T_ILLEGAL, + inst->at(offset)->as_OpBranch()->label())); + offset -= 2, i++; + } + } + // Delete all the old compare-and-branch instructions + for (int n = first_key; n < last_key; n++) { + inst->remove_at(start_insn); + inst->remove_at(start_insn); + } + // Insert the tableswitch instruction + inst->insert_before(start_insn, + new LIR_Op2(lir_cmp, lir_cond_always, + LIR_OprFact::intConst(tableswitch_count), + reg_opr)); + inst->insert_before(start_insn + 1, new LIR_OpLabel(&sw->_branches)); + tableswitch_count++; + } + reg = noreg; + last_key = -2147483648; + } + next_state: + ; + } +#endif +} + +void LIR_Assembler::atomic_op(LIR_Code code, LIR_Opr src, LIR_Opr data, LIR_Opr dest, LIR_Opr tmp_op) { + BasicType type = src->type(); + Address addr = as_Address(src->as_address_ptr(), Address::toInsnDataType(type)); + + bool is_long = false; + + switch(type) { + case T_INT: + case T_OBJECT: + case T_ARRAY: + break; + case T_LONG: + is_long = true; + break; + default: + ShouldNotReachHere(); + } + + switch (code) { + case lir_xadd: + { + Register tmp = tmp_op->as_register(); + Register dst = as_reg(dest); + Label again; + __ lea(tmp, addr); + __ bind(again); + if(is_long) { + 
assert(dest->as_register_lo()->successor() == dest->as_register_hi(), "must be contiguous"); + assert((dest->as_register_lo()->encoding() & 1) == 0, "must be even"); + _masm->ldrexd(dst, tmp); + } else { + _masm->ldrex(dst, tmp); + } + arith_op(lir_add, dest, data, dest, NULL, false); + if (is_long) { + _masm->strexd(rscratch1, dst, tmp); + } else { + _masm->strex(rscratch1, dst, tmp); + } + __ cbnz(rscratch1, again); + arith_op(lir_sub, dest, data, dest, NULL, false); + break; + } + case lir_xchg: + { + Register tmp = tmp_op->as_register(); + Register obj = as_reg(data); + Register dst = as_reg(dest); + assert_different_registers(obj, addr.base(), tmp, rscratch1, dst); + Label again; + __ lea(tmp, addr); + __ bind(again); + if(is_long) { + assert(dest->as_register_lo()->successor() == dest->as_register_hi(), "must be contiguous"); + assert((dest->as_register_lo()->encoding() & 1) == 0, "must be even"); + + assert(data->is_double_cpu(), "should be double register"); + assert(data->as_register_lo()->successor() == data->as_register_hi(), "must be contiguous"); + assert((data->as_register_lo()->encoding() & 1) == 0, "must be even"); + + _masm->ldrexd(dst, tmp); + _masm->strexd(rscratch1, obj, tmp); + } else { + _masm->ldrex(dst, tmp); + _masm->strex(rscratch1, obj, tmp); + } + __ cbnz(rscratch1, again); + } + break; + default: + ShouldNotReachHere(); + } + __ membar(__ AnyAny); +} + +void LIR_Assembler::check_register_collision(Register d, Register *s1, Register *s2, Register tmp) { + // use a temp if any of the registers used as a source of operation + // collide with result register of the prerequisite operation + if (d == *s1) { + __ mov(tmp, d); + *s1 = tmp; + } else if (s2 && d == *s2) { + __ mov(tmp, d); + *s2 = tmp; + } +} + +#undef __ --- /dev/null 2018-09-25 19:24:24.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_LIRAssembler_aarch32.hpp 2018-09-25 19:24:24.000000000 +0300 @@ -0,0 +1,93 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_C1_LIRASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_VM_C1_LIRASSEMBLER_AARCH32_HPP + +#include "assembler_aarch32.hpp" + +// ArrayCopyStub needs access to bailout +friend class ArrayCopyStub; + + private: + + int array_element_size(BasicType type) const; + + // helper functions which checks for overflow and sets bailout if it + // occurs. 
Always returns a valid embeddable pointer but in the + // bailout case the pointer won't be to unique storage. + address float_constant(float f); + address double_constant(double d); + + Address as_Address(LIR_Address* addr, Register tmp, Address::InsnDataType type); + Address as_Address_hi(LIR_Address* addr, Address::InsnDataType type); + Address as_Address_lo(LIR_Address* addr, Address::InsnDataType type); + + Address as_Address(LIR_Address* addr, Address::InsnDataType type) { + return as_Address(addr, rscratch1, type); + } + + + // Record the type of the receiver in ReceiverTypeData + void type_profile_helper(Register mdo, + ciMethodData *md, ciProfileData *data, + Register recv, Label* update_done); + void add_debug_info_for_branch(address adr, CodeEmitInfo* info); + + void casw(Register addr, Register newval, Register cmpval, Register result); + void casl(Register addr, Register newval_lo, Register newval_hi, + Register cmpval_lo, Register cmpval_hi, + Register tmp_lo, Register tmp_hi, Register result); + + FloatRegister as_float_reg(LIR_Opr doubleReg); + + static const int max_tableswitches = 20; + struct tableswitch switches[max_tableswitches]; + int tableswitch_count; + + void init() { tableswitch_count = 0; } + + void deoptimize_trap(CodeEmitInfo *info); + + enum { + _call_stub_size = 12 * NativeInstruction::arm_insn_sz, + _call_aot_stub_size = 0, + _exception_handler_size = DEBUG_ONLY(1*K) NOT_DEBUG(175), + _deopt_handler_size = 7 * NativeInstruction::arm_insn_sz + }; + + // remap input register (*s1 or *s2) to a temp one if it is at the same time + // used a result register (d) of a preceeding operation (so otherwise its + // contents gets effectively corrupt) + void check_register_collision(Register d, Register *s1, Register *s2 = NULL, Register tmp = rscratch1); + +public: + + void store_parameter(Register r, int offset_from_sp_in_words); + void store_parameter(jint c, int offset_from_sp_in_words); + void store_parameter(jobject c, int offset_from_sp_in_words); + +#endif // CPU_AARCH32_VM_C1_LIRASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:25.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_LIRGenerator_aarch32.cpp 2018-09-25 19:24:25.000000000 +0300 @@ -0,0 +1,1740 @@ +/* + * Copyright (c) 2005, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "c1/c1_Compilation.hpp" +#include "c1/c1_FrameMap.hpp" +#include "c1/c1_Instruction.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_LIRGenerator.hpp" +#include "c1/c1_Runtime1.hpp" +#include "c1/c1_ValueStack.hpp" +#include "ci/ciArray.hpp" +#include "ci/ciObjArrayKlass.hpp" +#include "ci/ciTypeArrayKlass.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "vmreg_aarch32.inline.hpp" +#include "vm_version_aarch32.hpp" + +#ifdef ASSERT +#define __ gen()->lir(__FILE__, __LINE__)-> +#else +#define __ gen()->lir()-> +#endif + +// Item will be loaded into a byte register; Intel only +void LIRItem::load_byte_item() { + load_item(); +} + + +void LIRItem::load_nonconstant() { + LIR_Opr r = value()->operand(); + if (r->is_constant()) { + _result = r; + } else { + load_item(); + } +} + +//-------------------------------------------------------------- +// LIRGenerator +//-------------------------------------------------------------- + + +LIR_Opr LIRGenerator::exceptionOopOpr() { return FrameMap::r0_oop_opr; } +LIR_Opr LIRGenerator::exceptionPcOpr() { return FrameMap::r3_opr; } +LIR_Opr LIRGenerator::divInOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::divOutOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::remOutOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::shiftCountOpr() { Unimplemented(); return LIR_OprFact::illegalOpr; } +LIR_Opr LIRGenerator::syncLockOpr() { return new_register(T_INT); } +LIR_Opr LIRGenerator::syncTempOpr() { return FrameMap::r0_opr; } +LIR_Opr LIRGenerator::getThreadTemp() { return LIR_OprFact::illegalOpr; } + + +LIR_Opr LIRGenerator::java_result_register_for(ValueType* type, bool callee) { + LIR_Opr opr; + switch (type->tag()) { + case floatTag: + if(hasFPU()) { + opr = FrameMap::fpu0_float_opr; break;; + } + case doubleTag: + if(hasFPU()) { + opr = FrameMap::fpu0_double_opr; break; + } + default: opr = result_register_for(type, callee); + } + return opr; +} +LIR_Opr LIRGenerator::result_register_for(ValueType* type, bool callee) { + LIR_Opr opr; + switch (type->tag()) { + case floatTag: +#ifdef HARD_FLOAT_CC + opr = FrameMap::fpu0_float_opr; break; +#endif + case intTag: opr = FrameMap::r0_opr; break; + case objectTag: opr = FrameMap::r0_oop_opr; break; + case doubleTag: +#ifdef HARD_FLOAT_CC + opr = FrameMap::fpu0_double_opr; break; +#endif + case longTag: opr = FrameMap::long0_opr; break; + + case addressTag: + default: ShouldNotReachHere(); return LIR_OprFact::illegalOpr; + } +#ifndef HARD_FLOAT_CC + assert(type->is_float_kind() || opr->type_field() == as_OprType(as_BasicType(type)), "type mismatch"); +#else + assert(opr->type_field() == as_OprType(as_BasicType(type)), "type mismatch"); +#endif + return opr; +} + + +LIR_Opr LIRGenerator::rlock_byte(BasicType type) { + LIR_Opr reg = new_register(T_INT); + set_vreg_flag(reg, LIRGenerator::byte_reg); + return reg; +} + + +//--------- loading items into registers -------------------------------- + + +bool LIRGenerator::can_store_as_constant(Value v, BasicType type) const { + if (v->type()->as_IntConstant() != NULL) { + return v->type()->as_IntConstant()->value() == 0L; + } else if (v->type()->as_LongConstant() != NULL) { + return v->type()->as_LongConstant()->value() == 0L; + } else if (v->type()->as_ObjectConstant() != NULL) { + return 
v->type()->as_ObjectConstant()->value()->is_null_object(); + } else { + return false; + } +} + +bool LIRGenerator::can_inline_as_constant(Value v) const { + if (v->type()->as_IntConstant() != NULL) { + return Assembler::operand_valid_for_add_sub_immediate(v->type()->as_IntConstant()->value()); + } else if (v->type()->as_LongConstant() != NULL) { + return Assembler::operand_valid_for_add_sub_immediate(v->type()->as_LongConstant()->value()); + } else if (v->type()->as_ObjectConstant() != NULL) { + return v->type()->as_ObjectConstant()->value()->is_null_object(); + } else { + return false; + } +} + + +bool LIRGenerator::can_inline_as_constant(LIR_Const* c) const { + switch (c->type()) { + case T_BOOLEAN: + case T_CHAR: + case T_BYTE: + case T_SHORT: + case T_INT: + return Assembler::operand_valid_for_add_sub_immediate(c->as_jint()); + case T_LONG: + return Assembler::operand_valid_for_add_sub_immediate(c->as_jlong()); + case T_OBJECT: + return c->as_jobject() == (jobject) NULL; + case T_METADATA: + return c->as_metadata() == (Metadata*) NULL; + case T_FLOAT: + if( hasFPU()) { + return Assembler::operand_valid_for_float_immediate(c->as_jfloat()); + } else { + return Assembler::operand_valid_for_add_sub_immediate(c->as_jint()); + } + case T_DOUBLE: + if( hasFPU()) { + return Assembler::operand_valid_for_float_immediate(c->as_jdouble()); + } else { + return Assembler::operand_valid_for_add_sub_immediate(c->as_jlong()); + } + } + return false; +} + +LIR_Opr LIRGenerator::safepoint_poll_register() { + return LIR_OprFact::illegalOpr; +} + +LIR_Address* LIRGenerator::generate_address(LIR_Opr base, LIR_Opr index, + int shift, int disp, BasicType type) { + const Address::InsnDataType insn_type = Address::toInsnDataType(type); + assert(base->is_register(), "must be"); + + // accumulate fixed displacements + if (index->is_constant()) { + assert(index->as_constant_ptr()->type() == T_INT, "assumed"); + disp += index->as_constant_ptr()->as_jint() << shift; + index = LIR_OprFact::illegalOpr; + shift = 0; + } + + // aarch32 cannot handle natively both index and offset at the same time + // need to calculate effective value + if (index->is_register()) { + if ((disp != 0) && + Address::shift_ok_for_index(lsl(shift), insn_type) && + Assembler::operand_valid_for_add_sub_immediate(disp)) { + // add tmp, base, disp + // ldr r, [tmp, index, LSL #shift ] + LIR_Opr tmp = new_pointer_register(); + __ add(base, LIR_OprFact::intptrConst(disp), tmp); + base = tmp; + disp = 0; + } else { + assert(shift <= (int) LIR_Address::times_8, "no large shift could be here"); + // add tmp, base, index, LSL #shift + // ... + // ldr r, [tmp, ...] 
+      LIR_Opr tmp = new_pointer_register();
+      __ leal(LIR_OprFact::address(new LIR_Address(base, index, (LIR_Address::Scale) shift, 0, type)), tmp);
+      base = tmp;
+      index = LIR_OprFact::illegalOpr;
+      shift = 0;
+    }
+  }
+
+  assert(!index->is_register() || (disp == 0), "should be");
+
+  if (!Address::offset_ok_for_immed(disp, insn_type)) {
+    assert(!index->is_valid(), "should be");
+    // here index should be illegal so we can replace it with the displacement
+    // loaded into a register
+    // mov tmp, disp
+    // ldr r, [base, tmp]
+    index = new_pointer_register();
+    __ move(LIR_OprFact::intptrConst(disp), index);
+    disp = 0;
+  }
+
+  assert(Address::offset_ok_for_immed(disp, Address::toInsnDataType(type)), "must be");
+  return new LIR_Address(base, index, (LIR_Address::Scale) shift, disp, type);
+}
+
+LIR_Address* LIRGenerator::emit_array_address(LIR_Opr array_opr, LIR_Opr index_opr,
+                                              BasicType type) {
+  int offset_in_bytes = arrayOopDesc::base_offset_in_bytes(type);
+  int elem_size = type2aelembytes(type);
+  int shift = exact_log2(elem_size);
+
+  LIR_Address* addr = generate_address(array_opr, index_opr, shift, offset_in_bytes, type);
+
+  return addr;
+}
+
+LIR_Opr LIRGenerator::load_immediate(int x, BasicType type) {
+  LIR_Opr r;
+  if (type == T_LONG) {
+    r = LIR_OprFact::longConst(x);
+    if (!Assembler::operand_valid_for_logical_immediate(false, x)) {
+      LIR_Opr tmp = new_register(type);
+      __ move(r, tmp);
+      return tmp;
+    }
+  } else if (type == T_INT) {
+    r = LIR_OprFact::intConst(x);
+    if (!Assembler::operand_valid_for_logical_immediate(true, x)) {
+      // This is all rather nasty. We don't know whether our constant
+      // is required for a logical or an arithmetic operation, so we
+      // don't know what the range of valid values is!!
+      LIR_Opr tmp = new_register(type);
+      __ move(r, tmp);
+      return tmp;
+    }
+  } else {
+    ShouldNotReachHere();
+    r = LIR_OprFact::illegalOpr; // unreachable
+  }
+  return r;
+}
+
+
+void LIRGenerator::increment_counter(address counter, BasicType type, int step) {
+  LIR_Opr pointer = new_pointer_register();
+  __ move(LIR_OprFact::intptrConst(counter), pointer);
+  LIR_Address* addr = new LIR_Address(pointer, type);
+  increment_counter(addr, step);
+}
+
+void LIRGenerator::increment_counter(LIR_Address* addr, int step) {
+  LIR_Opr imm = NULL;
+  switch(addr->type()) {
+  case T_INT:
+    imm = LIR_OprFact::intConst(step);
+    break;
+  case T_LONG:
+    imm = LIR_OprFact::longConst(step);
+    break;
+  default:
+    ShouldNotReachHere();
+  }
+  LIR_Opr reg = new_register(addr->type());
+  __ load(addr, reg);
+  __ add(reg, imm, reg);
+  __ store(reg, addr);
+}
+
+void LIRGenerator::cmp_mem_int(LIR_Condition condition, LIR_Opr base, int disp, int c, CodeEmitInfo* info) {
+  LIR_Opr reg = new_register(T_INT);
+  __ load(generate_address(base, disp, T_INT), reg, info);
+  __ cmp(condition, reg, LIR_OprFact::intConst(c));
+}
+
+void LIRGenerator::cmp_reg_mem(LIR_Condition condition, LIR_Opr reg, LIR_Opr base, int disp, BasicType type, CodeEmitInfo* info) {
+  LIR_Opr reg1 = new_register(T_INT);
+  __ load(generate_address(base, disp, type), reg1, info);
+  __ cmp(condition, reg, reg1);
+}
+
+bool LIRGenerator::strength_reduce_multiply(LIR_Opr left, int c, LIR_Opr result, LIR_Opr tmp) {
+
+  if (is_power_of_2(c - 1)) {
+    __ shift_left(left, exact_log2(c - 1), tmp);
+    __ add(tmp, left, result);
+    return true;
+  } else if (is_power_of_2(c + 1)) {
+    __ shift_left(left, exact_log2(c + 1), tmp);
+    __ sub(tmp, left, result);
+    return true;
+  } else {
+    return false;
+  }
+}
+
+void
LIRGenerator::store_stack_parameter (LIR_Opr item, ByteSize offset_from_sp) { + BasicType type = item->type(); + __ store(item, new LIR_Address(FrameMap::sp_opr, in_bytes(offset_from_sp), type)); +} + +void LIRGenerator::array_store_check(LIR_Opr value, LIR_Opr array, CodeEmitInfo* store_check_info, ciMethod* profiled_method, int profiled_bci) { + LIR_Opr tmp1 = new_register(objectType); + LIR_Opr tmp2 = new_register(objectType); + LIR_Opr tmp3 = new_register(objectType); + __ store_check(value, array, tmp1, tmp2, tmp3, store_check_info, profiled_method, profiled_bci); +} + +//---------------------------------------------------------------------- +// visitor functions +//---------------------------------------------------------------------- + +void LIRGenerator::do_MonitorEnter(MonitorEnter* x) { + assert(x->is_pinned(),""); + LIRItem obj(x->obj(), this); + obj.load_item(); + + set_no_result(x); + + // "lock" stores the address of the monitor stack slot, so this is not an oop + LIR_Opr lock = new_register(T_INT); + // Need a scratch register for biased locking + LIR_Opr scratch = LIR_OprFact::illegalOpr; + if (UseBiasedLocking) { + scratch = new_register(T_INT); + } + + CodeEmitInfo* info_for_exception = NULL; + if (x->needs_null_check()) { + info_for_exception = state_for(x); + } + // this CodeEmitInfo must not have the xhandlers because here the + // object is already locked (xhandlers expect object to be unlocked) + CodeEmitInfo* info = state_for(x, x->state(), true); + monitor_enter(obj.result(), lock, syncTempOpr(), scratch, + x->monitor_no(), info_for_exception, info); +} + + +void LIRGenerator::do_MonitorExit(MonitorExit* x) { + assert(x->is_pinned(),""); + + LIRItem obj(x->obj(), this); + obj.dont_load_item(); + + LIR_Opr lock = new_register(T_INT); + LIR_Opr obj_temp = new_register(T_INT); + set_no_result(x); + monitor_exit(obj_temp, lock, syncTempOpr(), LIR_OprFact::illegalOpr, x->monitor_no()); +} + + +void LIRGenerator::do_NegateOp(NegateOp* x) { +#ifdef __SOFTFP__ + if(x->x()->type()->is_float_kind() && !(hasFPU())) { + address entry; + if (x->x()->type()->is_float()) { + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fneg); + } else { + entry = CAST_FROM_FN_PTR(address, SharedRuntime::dneg); + } + LIR_Opr result = call_runtime(x->x(), entry, x->type(), NULL); + set_result(x, result); + } else +#endif + { + LIRItem from(x->x(), this); + from.load_item(); + LIR_Opr result = rlock_result(x); + __ negate (from.result(), result); + } +} + +// for _fadd, _fmul, _fsub, _fdiv, _frem +// _dadd, _dmul, _dsub, _ddiv, _drem +void LIRGenerator::do_ArithmeticOp_FPU(ArithmeticOp* x) { + + if (x->op() == Bytecodes::_frem || x->op() == Bytecodes::_drem) { + address entry; + if (x->op() == Bytecodes::_frem) { + entry = CAST_FROM_FN_PTR(address, SharedRuntime::frem); + } else { + entry = CAST_FROM_FN_PTR(address, SharedRuntime::drem); + } + LIR_Opr result = call_runtime(x->x(), x->y(), entry, x->type(), NULL); + set_result(x, result); + + return; + } + + if(hasFPU()) { + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + LIRItem* left_arg = &left; + LIRItem* right_arg = &right; + + // Always load right hand side. 
+ right.load_item(); + + if (!left.is_register()) + left.load_item(); + + LIR_Opr reg = rlock(x); + LIR_Opr tmp = LIR_OprFact::illegalOpr; + if (x->is_strictfp() && (x->op() == Bytecodes::_dmul || x->op() == Bytecodes::_ddiv)) { + tmp = new_register(T_DOUBLE); + } + + arithmetic_op_fpu(x->op(), reg, left.result(), right.result(), NULL); + + set_result(x, round_item(reg)); + } else { +#ifdef __SOFTFP__ + address entry; + + switch (x->op()) { + case Bytecodes::_fmul: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fmul); + break; + case Bytecodes::_dmul: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::dmul); + break; + case Bytecodes::_fdiv: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fdiv); + break; + case Bytecodes::_ddiv: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::ddiv); + break; + case Bytecodes::_fadd: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fadd); + break; + case Bytecodes::_dadd: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::dadd); + break; + case Bytecodes::_fsub: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fsub); + break; + case Bytecodes::_dsub: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsub); + break; + default: + ShouldNotReachHere(); + } + LIR_Opr result = call_runtime(x->x(), x->y(), entry, x->type(), NULL); + set_result(x, result); +#else + ShouldNotReachHere();// check your compiler settings +#endif + } +} + +// for _ladd, _lmul, _lsub, _ldiv, _lrem +void LIRGenerator::do_ArithmeticOp_Long(ArithmeticOp* x) { + + // missing test if instr is commutative and if we should swap + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + + if (x->op() == Bytecodes::_ldiv || x->op() == Bytecodes::_lrem) { + + BasicTypeList signature(2); + signature.append(T_LONG); + signature.append(T_LONG); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + // check for division by zero (destroys registers of right operand!) + CodeEmitInfo* info = state_for(x); + + right.load_item(); + + __ cmp(lir_cond_equal, right.result(), LIR_OprFact::longConst(0)); + __ branch(lir_cond_equal, T_LONG, new DivByZeroStub(info)); + + const LIR_Opr result_reg = result_register_for(x->type()); + left.load_item_force(cc->at(1)); + __ move(right.result(), cc->at(0)); + + address entry; + switch (x->op()) { + case Bytecodes::_lrem: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::lrem); + break; // check if dividend is 0 is done elsewhere + case Bytecodes::_ldiv: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::ldiv); + break; // check if dividend is 0 is done elsewhere + default: + ShouldNotReachHere(); return; // unreachable + } + + LIR_Opr result = rlock_result(x); + __ call_runtime_leaf(entry, getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + } else { + assert (x->op() == Bytecodes::_lmul || x->op() == Bytecodes::_ladd || x->op() == Bytecodes::_lsub, + "expect lmul, ladd or lsub"); + // add, sub, mul + left.load_item(); + if (! right.is_register()) { + if (x->op() == Bytecodes::_lmul + || ! right.is_constant() + || ! 
Assembler::operand_valid_for_add_sub_immediate(right.get_jlong_constant())) { + right.load_item(); + } else { // add, sub + assert (x->op() == Bytecodes::_ladd || x->op() == Bytecodes::_lsub, "expect ladd or lsub"); + // don't load constants to save register + right.load_nonconstant(); + } + } + rlock_result(x); + arithmetic_op_long(x->op(), x->operand(), left.result(), right.result(), NULL); + } +} + +// for: _iadd, _imul, _isub, _idiv, _irem +void LIRGenerator::do_ArithmeticOp_Int(ArithmeticOp* x) { + + // Test if instr is commutative and if we should swap + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + LIRItem* left_arg = &left; + LIRItem* right_arg = &right; + if (x->is_commutative() && left.is_stack() && right.is_register()) { + // swap them if left is real stack (or cached) and right is real register(not cached) + left_arg = &right; + right_arg = &left; + } + + left_arg->load_item(); + + // do not need to load right, as we can handle stack and constants + if (x->op() == Bytecodes::_idiv || x->op() == Bytecodes::_irem) { + + right_arg->load_item(); + rlock_result(x); + + if (!(VM_Version::features() & FT_HW_DIVIDE)) { + // MacroAssembler::divide32 destroys both operand registers + left_arg->set_destroys_register(); + right_arg->set_destroys_register(); + } + + CodeEmitInfo* info = state_for(x); + LIR_Opr tmp = new_register(T_INT); + __ cmp(lir_cond_equal, right_arg->result(), LIR_OprFact::intConst(0)); + __ branch(lir_cond_equal, T_INT, new DivByZeroStub(info)); + info = state_for(x); + + if (x->op() == Bytecodes::_irem) { + __ irem(left_arg->result(), right_arg->result(), x->operand(), tmp, NULL); + } else if (x->op() == Bytecodes::_idiv) { + __ idiv(left_arg->result(), right_arg->result(), x->operand(), tmp, NULL); + } + + } else if (x->op() == Bytecodes::_iadd || x->op() == Bytecodes::_isub) { + if (right.is_constant() + && Assembler::operand_valid_for_add_sub_immediate(right.get_jint_constant())) { + right.load_nonconstant(); + } else { + right.load_item(); + } + rlock_result(x); + arithmetic_op_int(x->op(), x->operand(), left_arg->result(), right_arg->result(), LIR_OprFact::illegalOpr); + } else { + assert (x->op() == Bytecodes::_imul, "expect imul"); + if (right.is_constant()) { + jint c = right.get_jint_constant(); + if (c > 0 && c < max_jint && (is_power_of_2(c) || is_power_of_2(c - 1) || is_power_of_2(c + 1))) { + right_arg->dont_load_item(); + } else { + // Cannot use constant op. 
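(For orientation: the power-of-two test above admits exactly the constants that strength_reduce_multiply earlier in this file can lower to a shift plus an add or a subtract. A minimal standalone check of that identity:)

    #include <cassert>
    #include <cstdint>

    // x * (2^n + 1)  ->  (x << n) + x     (chosen when c - 1 is a power of two)
    static int32_t mul_pow2_plus_1(int32_t x, int n)  { return (int32_t)(((uint32_t)x << n) + (uint32_t)x); }
    // x * (2^n - 1)  ->  (x << n) - x     (chosen when c + 1 is a power of two)
    static int32_t mul_pow2_minus_1(int32_t x, int n) { return (int32_t)(((uint32_t)x << n) - (uint32_t)x); }

    int main() {
      assert(mul_pow2_plus_1(7, 3)  == 7 * 9);  // c = 9, c - 1 = 8 is a power of two
      assert(mul_pow2_minus_1(7, 3) == 7 * 7);  // c = 7, c + 1 = 8 is a power of two
      assert((7 << 3) == 7 * 8);                // c = 8 itself: a plain shift
      return 0;
    }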
+ right_arg->load_item(); + } + } else { + right.load_item(); + } + rlock_result(x); + arithmetic_op_int(x->op(), x->operand(), left_arg->result(), right_arg->result(), new_register(T_INT)); + } +} + +void LIRGenerator::do_ArithmeticOp(ArithmeticOp* x) { + // when an operand with use count 1 is the left operand, then it is + // likely that no move for 2-operand-LIR-form is necessary + if (x->is_commutative() && x->y()->as_Constant() == NULL && x->x()->use_count() > x->y()->use_count()) { + x->swap_operands(); + } + + ValueTag tag = x->type()->tag(); + assert(x->x()->type()->tag() == tag && x->y()->type()->tag() == tag, "wrong parameters"); + switch (tag) { + case floatTag: + case doubleTag: do_ArithmeticOp_FPU(x); return; + case longTag: do_ArithmeticOp_Long(x); return; + case intTag: do_ArithmeticOp_Int(x); return; + } + ShouldNotReachHere(); +} + +// _ishl, _lshl, _ishr, _lshr, _iushr, _lushr +void LIRGenerator::do_ShiftOp(ShiftOp* x) { + + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + + left.load_item(); + + rlock_result(x); + if (right.is_constant()) { + right.dont_load_item(); + + switch (x->op()) { + case Bytecodes::_ishl: { + int c = right.get_jint_constant() & 0x1f; + __ shift_left(left.result(), c, x->operand()); + break; + } + case Bytecodes::_ishr: { + int c = right.get_jint_constant() & 0x1f; + __ shift_right(left.result(), c, x->operand()); + break; + } + case Bytecodes::_iushr: { + int c = right.get_jint_constant() & 0x1f; + __ unsigned_shift_right(left.result(), c, x->operand()); + break; + } + case Bytecodes::_lshl: { + int c = right.get_jint_constant() & 0x3f; + __ shift_left(left.result(), c, x->operand()); + break; + } + case Bytecodes::_lshr: { + int c = right.get_jint_constant() & 0x3f; + __ shift_right(left.result(), c, x->operand()); + break; + } + case Bytecodes::_lushr: { + int c = right.get_jint_constant() & 0x3f; + __ unsigned_shift_right(left.result(), c, x->operand()); + break; + } + default: + ShouldNotReachHere(); + } + } else { + right.load_item(); + LIR_Opr tmp = LIR_OprFact::illegalOpr; + if (left.result()->type() == T_LONG) + left.set_destroys_register(); + switch (x->op()) { + case Bytecodes::_ishl: { + __ shift_left(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_ishr: { + __ shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_iushr: { + __ unsigned_shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_lshl: { + __ shift_left(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_lshr: { + __ shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + case Bytecodes::_lushr: { + __ unsigned_shift_right(left.result(), right.result(), x->operand(), tmp); + break; + } + default: + ShouldNotReachHere(); + } + } +} + +// _iand, _land, _ior, _lor, _ixor, _lxor +void LIRGenerator::do_LogicOp(LogicOp* x) { + + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + + left.load_item(); + + rlock_result(x); + if (right.is_constant() + && ((right.type()->tag() == intTag + && Assembler::operand_valid_for_logical_immediate(true, right.get_jint_constant())) + || (right.type()->tag() == longTag + && Assembler::operand_valid_for_logical_immediate(false, right.get_jlong_constant())))) { + right.dont_load_item(); + } else { + right.load_item(); + } + switch (x->op()) { + case Bytecodes::_iand: + case Bytecodes::_land: + __ logical_and(left.result(), right.result(), x->operand()); break; + 
case Bytecodes::_ior: + case Bytecodes::_lor: + __ logical_or (left.result(), right.result(), x->operand()); break; + case Bytecodes::_ixor: + case Bytecodes::_lxor: + __ logical_xor(left.result(), right.result(), x->operand()); break; + default: Unimplemented(); + } +} + +// _lcmp, _fcmpl, _fcmpg, _dcmpl, _dcmpg +void LIRGenerator::do_CompareOp(CompareOp* x) { + LIRItem left(x->x(), this); + LIRItem right(x->y(), this); + ValueTag tag = x->x()->type()->tag(); + left.load_item(); + right.load_item(); + + if (x->x()->type()->is_float_kind()) { + Bytecodes::Code code = x->op(); + if(hasFPU()) { + LIR_Opr reg = rlock_result(x); + __ fcmp2int(left.result(), right.result(), reg, (code == Bytecodes::_fcmpl || code == Bytecodes::_dcmpl)); + } else { +#ifdef __SOFTFP__ + address entry; + switch (code) { + case Bytecodes::_fcmpl: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fcmpl); + break; + case Bytecodes::_fcmpg: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::fcmpg); + break; + case Bytecodes::_dcmpl: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::dcmpl); + break; + case Bytecodes::_dcmpg: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::dcmpg); + break; + default: + ShouldNotReachHere(); + } + + LIR_Opr result = call_runtime(x->x(), x->y(), entry, x->type(), NULL); + set_result(x, result); +#else + ShouldNotReachHere(); // check your compiler settings +#endif + } + } else if (x->x()->type()->tag() == longTag) { + LIR_Opr reg = rlock_result(x); + __ lcmp2int(left.result(), right.result(), reg); + } else { + Unimplemented(); + } +} + +LIR_Opr LIRGenerator::atomic_cmpxchg(BasicType type, LIR_Opr addr, LIRItem& cmp_value, LIRItem& new_value) { + LIR_Opr ill = LIR_OprFact::illegalOpr; // for convenience + new_value.load_item(); + cmp_value.load_item(); + LIR_Opr result = new_register(T_INT); + if (type == T_OBJECT || type == T_ARRAY) { + __ cas_obj(addr, cmp_value.result(), new_value.result(), ill, ill, result); + } else if (type == T_INT) { + __ cas_int(addr, cmp_value.result(), new_value.result(), ill, ill, result); + } else if (type == T_LONG) { + __ cas_long(addr, cmp_value.result(), new_value.result(), FrameMap::long1_opr, ill, result); + } else { + ShouldNotReachHere(); + } + __ logical_xor(result, LIR_OprFact::intConst(1), result); + return result; +} + +LIR_Opr LIRGenerator::atomic_xchg(BasicType type, LIR_Opr addr, LIRItem& value) { + bool is_oop = type == T_OBJECT || type == T_ARRAY; + LIR_Opr result = new_register(type); + value.load_item(); + assert(type == T_INT || is_oop, "unexpected type"); + LIR_Opr tmp = new_register(T_INT); + __ xchg(addr, value.result(), result, tmp); + return result; +} + +LIR_Opr LIRGenerator::atomic_add(BasicType type, LIR_Opr addr, LIRItem& value) { + LIR_Opr result = new_register(type); + value.load_item(); + assert(type == T_INT, "unexpected type"); + LIR_Opr tmp = new_register(T_INT); + __ xadd(addr, value.result(), result, tmp); + return result; +} + +void LIRGenerator::do_MathIntrinsic(Intrinsic* x) { + switch (x->id()) { + default: + ShouldNotReachHere(); + break; + case vmIntrinsics::_dabs: + case vmIntrinsics::_dsqrt: + if(hasFPU()) { + assert(x->number_of_arguments() == 1, "wrong type"); + LIRItem value(x->argument_at(0), this); + value.load_item(); + LIR_Opr dst = rlock_result(x); + + switch (x->id()) { + case vmIntrinsics::_dsqrt: { + __ sqrt(value.result(), dst, LIR_OprFact::illegalOpr); + break; + } + case vmIntrinsics::_dabs: { + __ abs(value.result(), dst, LIR_OprFact::illegalOpr); + break; + } + } + break; + }// fall through for 
FPU less cores + case vmIntrinsics::_dlog10: // fall through + case vmIntrinsics::_dlog: // fall through + case vmIntrinsics::_dsin: // fall through + case vmIntrinsics::_dtan: // fall through + case vmIntrinsics::_dcos: // fall through + case vmIntrinsics::_dexp: { + assert(x->number_of_arguments() == 1, "wrong type"); + + address runtime_entry = NULL; + switch (x->id()) { +#ifdef __SOFTFP__ + case vmIntrinsics::_dabs: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dabs); + break; + case vmIntrinsics::_dsqrt: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsqrt); + break; +#endif + case vmIntrinsics::_dsin: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dsin); + break; + case vmIntrinsics::_dcos: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dcos); + break; + case vmIntrinsics::_dtan: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dtan); + break; + case vmIntrinsics::_dlog: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog); + break; + case vmIntrinsics::_dlog10: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10); + break; + case vmIntrinsics::_dexp: + runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dexp); + break; + default: + ShouldNotReachHere(); + } + LIR_Opr result = call_runtime(x->argument_at(0), runtime_entry, x->type(), NULL); + set_result(x, result); + break; + } + case vmIntrinsics::_dpow: { + assert(x->number_of_arguments() == 2, "wrong type"); + address runtime_entry = CAST_FROM_FN_PTR(address, SharedRuntime::dpow); + LIR_Opr result = call_runtime(x->argument_at(0), x->argument_at(1), runtime_entry, x->type(), NULL); + set_result(x, result); + break; + } + } +} + + +void LIRGenerator::do_ArrayCopy(Intrinsic* x) { + assert(x->number_of_arguments() == 5, "wrong type"); + + // Make all state_for calls early since they can emit code + CodeEmitInfo* info = state_for(x, x->state()); + + LIRItem src(x->argument_at(0), this); + LIRItem src_pos(x->argument_at(1), this); + LIRItem dst(x->argument_at(2), this); + LIRItem dst_pos(x->argument_at(3), this); + LIRItem length(x->argument_at(4), this); + + // operands for arraycopy must use fixed registers, otherwise + // LinearScan will fail allocation (because arraycopy always needs a + // call) + + // The java calling convention does not give us enough registers + // so we occupy two more: r4 and r5. The fast path code will be able to + // make use of these registers for performance purpose. If going into + // slow path we'll spill extra data to the stack as necessary + + src.load_item_force (FrameMap::as_oop_opr(j_rarg0)); + src_pos.load_item_force (FrameMap::as_opr(j_rarg1)); + dst.load_item_force (FrameMap::as_oop_opr(j_rarg2)); + dst_pos.load_item_force (FrameMap::as_opr(j_rarg3)); + + length.load_item_force (FrameMap::as_opr(r4)); + LIR_Opr tmp = FrameMap::as_opr(r5); + + set_no_result(x); + + int flags; + ciArrayKlass* expected_type; + arraycopy_helper(x, &flags, &expected_type); + + __ arraycopy(src.result(), src_pos.result(), dst.result(), dst_pos.result(), length.result(), tmp, expected_type, flags, info); // does add_safepoint +} + +void LIRGenerator::do_update_CRC32_inner(Intrinsic* x, int is_crc32c) { + assert(!is_crc32c ? 
UseCRC32Intrinsics : UseCRC32CIntrinsics, "why are we here?"); + // Make all state_for calls early since they can emit code + LIR_Opr result = rlock_result(x); + switch (x->id()) { + case vmIntrinsics::_updateCRC32: { + LIRItem crc(x->argument_at(0), this); + LIRItem val(x->argument_at(1), this); + // val is destroyed by update_crc32 + val.set_destroys_register(); + crc.load_item(); + val.load_item(); + __ update_crc32(crc.result(), val.result(), result); + break; + } + case vmIntrinsics::_updateBytesCRC32: + case vmIntrinsics::_updateByteBufferCRC32: + assert(!is_crc32c, "why are we here?"); + case vmIntrinsics::_updateBytesCRC32C: + case vmIntrinsics::_updateDirectByteBufferCRC32C: + { + if (is_crc32c) { + assert(x->id() == vmIntrinsics::_updateBytesCRC32C || + x->id() == vmIntrinsics::_updateDirectByteBufferCRC32C, "why are we here?"); + } + bool is_updateBytes = (x->id() == vmIntrinsics::_updateBytesCRC32) || + (x->id() == vmIntrinsics::_updateBytesCRC32C); + + LIRItem crc(x->argument_at(0), this); + LIRItem buf(x->argument_at(1), this); + LIRItem off(x->argument_at(2), this); + LIRItem len(x->argument_at(3), this); // length, or end in case of crc32c + buf.load_item(); + off.load_nonconstant(); + + LIR_Opr index = off.result(); + int offset = is_updateBytes ? arrayOopDesc::base_offset_in_bytes(T_BYTE) : 0; + if(off.result()->is_constant()) { + index = LIR_OprFact::illegalOpr; + offset += off.result()->as_jint(); + } + LIR_Opr base_op = buf.result(); + + if (!is_updateBytes) { // long b raw address + base_op = new_register(T_INT); + __ convert(Bytecodes::_l2i, buf.result(), base_op); + } + + if (offset) { + LIR_Opr tmp = new_pointer_register(); + __ add(base_op, LIR_OprFact::intConst(offset), tmp); + base_op = tmp; + offset = 0; + } + + LIR_Address* a = new LIR_Address(base_op, + index, + offset, + T_BYTE); + BasicTypeList signature(3); + signature.append(T_INT); + signature.append(T_ADDRESS); + signature.append(T_INT); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + const LIR_Opr result_reg = result_register_for(x->type()); + + LIR_Opr addr = new_pointer_register(); + __ leal(LIR_OprFact::address(a), addr); + + crc.load_item_force(cc->at(0)); + __ move(addr, cc->at(1)); + + if (!is_crc32c) { + len.load_item_force(cc->at(2)); + } else { + __ sub(len.result(), off.result(), cc->at(2)); + } + + __ call_runtime_leaf( + !is_crc32c ? 
+ StubRoutines::updateBytesCRC32() : + StubRoutines::updateBytesCRC32C(), + getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + + break; + } + default: { + ShouldNotReachHere(); + } + } +} + +void LIRGenerator::do_update_CRC32(Intrinsic* x) { + do_update_CRC32_inner(x, false); +} + +void LIRGenerator::do_update_CRC32C(Intrinsic* x) { + do_update_CRC32_inner(x, true); +} + +void LIRGenerator::do_aescrypt_block(Intrinsic* x) { + assert(UseAESIntrinsics, "why are we here?"); + + // first argument is object itself + LIRItem obj(x->argument_at(0), this); + LIRItem from(x->argument_at(1), this); + LIRItem foff(x->argument_at(2), this); + LIRItem to(x->argument_at(3), this); + LIRItem toff(x->argument_at(4), this); + LIR_Opr addr = new_pointer_register(); + + BasicTypeList signature(3); + signature.append(T_ADDRESS); + signature.append(T_ADDRESS); + signature.append(T_ADDRESS); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + // From buffer + LIR_Address* a; + if (foff.result()->is_constant()) { + jint c = foff.result()->as_jint(); + a = new LIR_Address(from.result(), + c, + T_BYTE); + } else { + a = new LIR_Address(from.result(), + foff.result(), + LIR_Address::times_1, + 0, + T_BYTE); + } + __ leal(LIR_OprFact::address(a), addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), cc->at(0)); + + // To buffer + if (toff.result()->is_constant()) { + jint c = toff.result()->as_jint(); + a = new LIR_Address(to.result(), + c, + T_BYTE); + } else { + a = new LIR_Address(to.result(), + toff.result(), + LIR_Address::times_1, + 0, + T_BYTE); + } + __ leal(LIR_OprFact::address(a), addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), cc->at(1)); + + // Key + LIR_Address* k = new LIR_Address(obj.result(), + com_sun_crypto_provider_AESCrypt::K_offset(), + T_OBJECT); + + __ load(k, addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), cc->at(2)); + + set_no_result(x); + + switch (x->id()) { + case vmIntrinsics::_aescrypt_encryptBlock: + { + __ call_runtime_leaf(StubRoutines::aescrypt_encryptBlock(), getThreadTemp(), LIR_OprFact::illegalOpr, cc->args()); + break; + } + case vmIntrinsics::_aescrypt_decryptBlock: + { + __ call_runtime_leaf(StubRoutines::aescrypt_decryptBlock(), getThreadTemp(), LIR_OprFact::illegalOpr, cc->args()); + break; + } + default: + { + ShouldNotReachHere(); + } + } +} + +// This method is called in the C1 Xcom mode +void LIRGenerator::do_aescrypt_cbc(Intrinsic* x) { + assert(UseAESIntrinsics && UseNeon, "why are we here?"); + + LIRItem obj(x->argument_at(0), this); + LIRItem from(x->argument_at(1), this); + LIRItem foff(x->argument_at(2), this); + LIRItem flen(x->argument_at(3), this); + LIRItem to(x->argument_at(4), this); + LIRItem toff(x->argument_at(5), this); + LIR_Opr addr = new_pointer_register(); + + // force to load len into r4 + flen.load_item_force (FrameMap::as_opr(r4)); + + BasicTypeList signature(5); + signature.append(T_ADDRESS); //from + signature.append(T_ADDRESS); //to + signature.append(T_ADDRESS); //key + signature.append(T_ADDRESS); //rvec + signature.append(T_INT); //len + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + // From buffer + LIR_Address* a; + a = new LIR_Address(from.result(),T_OBJECT); + __ leal(LIR_OprFact::address(a), addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), addr); + __ add(addr, foff.result(), cc->at(0)); + + // To buffer + 
a = new LIR_Address(to.result(),T_OBJECT); + __ leal(LIR_OprFact::address(a), addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), addr); + __ add(addr, toff.result(), cc->at(1)); + // key + a = new LIR_Address(obj.result(), + com_sun_crypto_provider_FeedbackCipher::embeddedCipher_offset(), + T_OBJECT); + __ load(a, addr); + __ add(addr, LIR_OprFact::intConst(com_sun_crypto_provider_AESCrypt::K_offset()), addr); + a = new LIR_Address(addr, T_OBJECT); + __ load(a, addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), cc->at(2)); + + // rvec + a = new LIR_Address(obj.result(), + com_sun_crypto_provider_CipherBlockChaining::r_offset(), + T_OBJECT); + __ load(a, addr); + __ add(addr, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), cc->at(3)); + + //input len + __ move(flen.result(), cc->at(4)); + + LIR_Opr result = rlock_result(x); + const LIR_Opr result_reg = result_register_for(x->type()); + + switch (x->id()) { + case vmIntrinsics::_cipherBlockChaining_encryptAESCrypt: + { + __ call_runtime_leaf(StubRoutines::cipherBlockChaining_encryptAESCrypt_special(), getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + break; + } + case vmIntrinsics::_cipherBlockChaining_decryptAESCrypt: + { + __ call_runtime_leaf(StubRoutines::cipherBlockChaining_decryptAESCrypt_special(), getThreadTemp(), result_reg, cc->args()); + __ move(result_reg, result); + break; + } + default: + { + ShouldNotReachHere(); + } + + } +} + +// This method is called in the C1 Xcom mode +void LIRGenerator::do_sha(Intrinsic* x) { + assert(UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics, "why are we here?"); + + // first argument is object itself + LIRItem obj(x->argument_at(0), this); + LIRItem from(x->argument_at(1), this); + LIRItem foff(x->argument_at(2), this); + + BasicTypeList signature(2); + signature.append(T_ADDRESS); + signature.append(T_ADDRESS); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + // From buffer + LIR_Address* a; + if (foff.result()->is_constant()) { + jint c = foff.result()->as_jint(); + a = new LIR_Address(from.result(), + c, + T_BYTE); + } else { + a = new LIR_Address(from.result(), + foff.result(), + LIR_Address::times_1, + 0, + T_BYTE); + } + LIR_Opr addr_from = new_pointer_register(); + __ leal(LIR_OprFact::address(a), addr_from); + __ add(addr_from, LIR_OprFact::intConst(arrayOopDesc::base_offset_in_bytes(T_BYTE)), cc->at(0)); + + + // State + int state_offset; + int state_data_offset; + address stub_addr; + switch (x->id()) { + case vmIntrinsics::_sha_implCompress: + state_offset = sun_security_provider_SHA2::state_offset(); + state_data_offset = arrayOopDesc::base_offset_in_bytes(T_INT); + stub_addr = StubRoutines::sha1_implCompress(); + break; + case vmIntrinsics::_sha2_implCompress: + state_offset = sun_security_provider_SHA2::state_offset(); + state_data_offset = arrayOopDesc::base_offset_in_bytes(T_INT); + stub_addr = StubRoutines::sha256_implCompress(); + break; + case vmIntrinsics::_sha5_implCompress: + state_offset = sun_security_provider_SHA5::state_offset(); + state_data_offset = arrayOopDesc::base_offset_in_bytes(T_LONG); + stub_addr = StubRoutines::sha512_implCompress(); + break; + default: + ShouldNotReachHere(); + return; // unreachable + } + + LIR_Address* state = new LIR_Address(obj.result(), state_offset, T_OBJECT); + + LIR_Opr addr_state = new_pointer_register(); + __ load(state, addr_state); + __ add(addr_state, 
LIR_OprFact::intConst(state_data_offset), cc->at(1)); + + set_no_result(x); + + __ call_runtime_leaf(stub_addr, getThreadTemp(), LIR_OprFact::illegalOpr, cc->args()); + +} + +void LIRGenerator::do_montgomery_intrinsic(Intrinsic* x) { + bool squaring = x->id() == vmIntrinsics::_montgomerySquare; + int n_arg_idx = squaring ? 1 : 2; + assert(squaring ? UseMontgomerySquareIntrinsic : UseMontgomeryMultiplyIntrinsic, "why are we here?"); + + LIRItem a(x->argument_at(0), this); + LIRItem n(x->argument_at(n_arg_idx), this); + LIRItem len(x->argument_at(n_arg_idx+1), this); + LIRItem inv(x->argument_at(n_arg_idx+2), this); + LIRItem product(x->argument_at(n_arg_idx+3), this); + + BasicTypeList signature(squaring ? 5 : 6); + signature.append(T_ADDRESS); + if (!squaring) + signature.append(T_ADDRESS); + signature.append(T_ADDRESS); + signature.append(T_INT); + signature.append(T_LONG); + signature.append(T_ADDRESS); + CallingConvention* cc = frame_map()->c_calling_convention(&signature); + + // A array, c_rarg0 + __ leal(LIR_OprFact::address(emit_array_address(a.result(), LIR_OprFact::intConst(0), T_INT)), cc->at(0)); + if (!squaring) { + LIRItem b(x->argument_at(1), this); + // B array, c_rarg1 + __ leal(LIR_OprFact::address(emit_array_address(b.result(), LIR_OprFact::intConst(0), T_INT)), cc->at(1)); + } + // N array, c_rarg2(1) + __ leal(LIR_OprFact::address(emit_array_address(n.result(), LIR_OprFact::intConst(0), T_INT)), cc->at(n_arg_idx)); + // len, c_rarg3(2) + assert(cc->at(n_arg_idx+1)->is_cpu_register(), "assumed"); + __ move(len.result(), cc->at(n_arg_idx+1)); + // inv, stack slot + assert(cc->at(n_arg_idx+2)->is_address(), "assumed"); + __ move(inv.result(), cc->at(n_arg_idx+2)); + // M array, stack slot + LIR_Opr addr = new_pointer_register(); + __ leal(LIR_OprFact::address(emit_array_address(product.result(), LIR_OprFact::intConst(0), T_INT)), addr); + __ move(addr, cc->at(n_arg_idx+3)); + + set_result(x, product.result()); + + switch (x->id()) { + case vmIntrinsics::_montgomeryMultiply: + { + __ call_runtime_leaf(StubRoutines::montgomeryMultiply(), getThreadTemp(), LIR_OprFact::illegalOpr, cc->args()); + break; + } + case vmIntrinsics::_montgomerySquare: + { + __ call_runtime_leaf(StubRoutines::montgomerySquare(), getThreadTemp(), LIR_OprFact::illegalOpr, cc->args()); + break; + } + default: + { + ShouldNotReachHere(); + } + } +} + +void LIRGenerator::do_FmaIntrinsic(Intrinsic* x) { + Unimplemented(); +} + +void LIRGenerator::do_vectorizedMismatch(Intrinsic* x) { + fatal("vectorizedMismatch intrinsic is not implemented on this platform"); +} + +// _i2l, _i2f, _i2d, _l2i, _l2f, _l2d, _f2i, _f2l, _f2d, _d2i, _d2l, _d2f +// _i2b, _i2c, _i2s +void LIRGenerator::do_Convert(Convert* x) { + address entry = NULL; + switch (x->op()) { + case Bytecodes::_d2i: + case Bytecodes::_f2i: + case Bytecodes::_i2f: + case Bytecodes::_i2d: + case Bytecodes::_f2d: + case Bytecodes::_d2f: + if(hasFPU()) { + break; + }// fall through for FPU-less cores + case Bytecodes::_d2l: + case Bytecodes::_f2l: + case Bytecodes::_l2d: + case Bytecodes::_l2f: { + + switch (x->op()) { +#ifdef __SOFTFP__ + case Bytecodes::_i2f: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::i2f); + break; + case Bytecodes::_i2d: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::i2d); + break; + case Bytecodes::_f2d: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::f2d); + break; + case Bytecodes::_d2f: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::d2f); + break; + case Bytecodes::_d2i: + entry = CAST_FROM_FN_PTR(address, 
SharedRuntime::d2i); + break; + case Bytecodes::_f2i: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::f2i); + break; +#endif + case Bytecodes::_d2l: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::d2l); + break; + case Bytecodes::_f2l: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::f2l); + break; + case Bytecodes::_l2d: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::l2d); + break; + case Bytecodes::_l2f: + entry = CAST_FROM_FN_PTR(address, SharedRuntime::l2f); + break; + default: + ShouldNotReachHere(); + } + LIR_Opr result = call_runtime(x->value(), entry, x->type(), NULL); + set_result(x, result); + } + break; + + default: + break; +} + if(NULL == entry) { + LIRItem value(x->value(), this); + value.load_item(); + + if (x->op() == Bytecodes::_f2i || x->op() == Bytecodes::_d2i) { + value.set_destroys_register(); + } + + LIR_Opr input = value.result(); + LIR_Opr result = rlock(x); + + __ convert(x->op(), input, result); + + assert(result->is_virtual(), "result must be virtual register"); + set_result(x, result); + } +} + +void LIRGenerator::do_NewInstance(NewInstance* x) { +#ifndef PRODUCT + if (PrintNotLoaded && !x->klass()->is_loaded()) { + tty->print_cr(" ###class not loaded at new bci %d", x->printable_bci()); + } +#endif + CodeEmitInfo* info = state_for(x, x->state()); + LIR_Opr reg = result_register_for(x->type()); + new_instance(reg, x->klass(), x->is_unresolved(), + FrameMap::r2_oop_opr, + FrameMap::r5_oop_opr, + FrameMap::r4_oop_opr, + LIR_OprFact::illegalOpr, + FrameMap::r3_metadata_opr, info); + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + +void LIRGenerator::do_NewTypeArray(NewTypeArray* x) { + CodeEmitInfo* info = state_for(x, x->state()); + + LIRItem length(x->length(), this); + length.load_item_force(FrameMap::r6_opr); + + LIR_Opr reg = result_register_for(x->type()); + LIR_Opr tmp1 = FrameMap::r2_oop_opr; + LIR_Opr tmp2 = FrameMap::r4_oop_opr; + LIR_Opr tmp3 = FrameMap::r5_oop_opr; + LIR_Opr tmp4 = reg; + LIR_Opr klass_reg = FrameMap::r3_metadata_opr; + LIR_Opr len = length.result(); + BasicType elem_type = x->elt_type(); + + __ metadata2reg(ciTypeArrayKlass::make(elem_type)->constant_encoding(), klass_reg); + + CodeStub* slow_path = new NewTypeArrayStub(klass_reg, len, reg, info); + __ allocate_array(reg, len, tmp1, tmp2, tmp3, tmp4, elem_type, klass_reg, slow_path); + + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + +void LIRGenerator::do_NewObjectArray(NewObjectArray* x) { + LIRItem length(x->length(), this); + // in case of patching (i.e., object class is not yet loaded), we need to reexecute the instruction + // and therefore provide the state before the parameters have been consumed + CodeEmitInfo* patching_info = NULL; + if (!x->klass()->is_loaded() || PatchALot) { + patching_info = state_for(x, x->state_before()); + } + + CodeEmitInfo* info = state_for(x, x->state()); + + LIR_Opr reg = result_register_for(x->type()); + LIR_Opr tmp1 = FrameMap::r2_oop_opr; + LIR_Opr tmp2 = FrameMap::r4_oop_opr; + LIR_Opr tmp3 = FrameMap::r5_oop_opr; + LIR_Opr tmp4 = reg; + LIR_Opr klass_reg = FrameMap::r3_metadata_opr; + + length.load_item_force(FrameMap::r6_opr); + LIR_Opr len = length.result(); + + CodeStub* slow_path = new NewObjectArrayStub(klass_reg, len, reg, info); + ciKlass* obj = (ciKlass*) ciObjArrayKlass::make(x->klass()); + if (obj == ciEnv::unloaded_ciobjarrayklass()) { + BAILOUT("encountered unloaded_ciobjarrayklass due to out of memory error"); + } + klass2reg_with_patching(klass_reg, obj, patching_info); + __ 
allocate_array(reg, len, tmp1, tmp2, tmp3, tmp4, T_OBJECT, klass_reg, slow_path); + + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + + +void LIRGenerator::do_NewMultiArray(NewMultiArray* x) { + Values* dims = x->dims(); + int i = dims->length(); + LIRItemList* items = new LIRItemList(i, i, NULL); + while (i-- > 0) { + LIRItem* size = new LIRItem(dims->at(i), this); + items->at_put(i, size); + } + + // Evaluate state_for early since it may emit code. + CodeEmitInfo* patching_info = NULL; + if (!x->klass()->is_loaded() || PatchALot) { + patching_info = state_for(x, x->state_before()); + + // Cannot re-use same xhandlers for multiple CodeEmitInfos, so + // clone all handlers (NOTE: Usually this is handled transparently + // by the CodeEmitInfo cloning logic in CodeStub constructors but + // is done explicitly here because a stub isn't being used). + x->set_exception_handlers(new XHandlers(x->exception_handlers())); + } + CodeEmitInfo* info = state_for(x, x->state()); + + i = dims->length(); + while (i-- > 0) { + LIRItem* size = items->at(i); + size->load_item(); + + store_stack_parameter(size->result(), in_ByteSize(i*4)); + } + + LIR_Opr klass_reg = FrameMap::r1_metadata_opr; + klass2reg_with_patching(klass_reg, x->klass(), patching_info); + + LIR_Opr rank = FrameMap::r2_opr; + __ move(LIR_OprFact::intConst(x->rank()), rank); + LIR_Opr varargs = FrameMap::r3_opr; + __ move(FrameMap::sp_opr, varargs); + LIR_OprList* args = new LIR_OprList(3); + args->append(klass_reg); + args->append(rank); + args->append(varargs); + LIR_Opr reg = result_register_for(x->type()); + __ call_runtime(Runtime1::entry_for(Runtime1::new_multi_array_id), + LIR_OprFact::illegalOpr, + reg, args, info); + + LIR_Opr result = rlock_result(x); + __ move(reg, result); +} + +void LIRGenerator::do_BlockBegin(BlockBegin* x) { + // nothing to do for now +} + +void LIRGenerator::do_CheckCast(CheckCast* x) { + LIRItem obj(x->obj(), this); + + CodeEmitInfo* patching_info = NULL; + if (!x->klass()->is_loaded() || (PatchALot && !x->is_incompatible_class_change_check() && !x->is_invokespecial_receiver_check())) { + // must do this before locking the destination register as an oop register, + // and before the obj is loaded (the latter is for deoptimization) + patching_info = state_for(x, x->state_before()); + } + obj.load_item(); + + // info for exceptions + CodeEmitInfo* info_for_exception = + (x->needs_exception_state() ? 
state_for(x) : + state_for(x, x->state_before(), true /*ignore_xhandler*/)); + + CodeStub* stub; + if (x->is_incompatible_class_change_check()) { + assert(patching_info == NULL, "can't patch this"); + stub = new SimpleExceptionStub(Runtime1::throw_incompatible_class_change_error_id, LIR_OprFact::illegalOpr, info_for_exception); + } else if (x->is_invokespecial_receiver_check()) { + assert(patching_info == NULL, "can't patch this"); + stub = new DeoptimizeStub(info_for_exception, + Deoptimization::Reason_class_check, + Deoptimization::Action_none); + } else { + stub = new SimpleExceptionStub(Runtime1::throw_class_cast_exception_id, obj.result(), info_for_exception); + } + LIR_Opr reg = rlock_result(x); + LIR_Opr tmp3 = LIR_OprFact::illegalOpr; + if (!x->klass()->is_loaded()) { + tmp3 = new_register(objectType); + } + __ checkcast(reg, obj.result(), x->klass(), + new_register(objectType), new_register(objectType), tmp3, + x->direct_compare(), info_for_exception, patching_info, stub, + x->profiled_method(), x->profiled_bci()); +} + +void LIRGenerator::do_InstanceOf(InstanceOf* x) { + LIRItem obj(x->obj(), this); + + // result and test object may not be in same register + LIR_Opr reg = rlock_result(x); + CodeEmitInfo* patching_info = NULL; + if ((!x->klass()->is_loaded() || PatchALot)) { + // must do this before locking the destination register as an oop register + patching_info = state_for(x, x->state_before()); + } + obj.load_item(); + LIR_Opr tmp3 = LIR_OprFact::illegalOpr; + if (!x->klass()->is_loaded()) { + tmp3 = new_register(objectType); + } + __ instanceof(reg, obj.result(), x->klass(), + new_register(objectType), new_register(objectType), tmp3, + x->direct_compare(), patching_info, x->profiled_method(), x->profiled_bci()); +} + +void LIRGenerator::do_If(If* x) { + assert(x->number_of_sux() == 2, "inconsistency"); + ValueTag tag = x->x()->type()->tag(); + + If::Condition cond = x->cond(); + + LIRItem xitem(x->x(), this); + LIRItem yitem(x->y(), this); + LIRItem* xin = &xitem; + LIRItem* yin = &yitem; + + xin->load_item(); + + if (yin->is_constant()) { + if (tag == longTag + && Assembler::operand_valid_for_add_sub_immediate(yin->get_jlong_constant())) { + yin->dont_load_item(); + } else if (tag == intTag + && Assembler::operand_valid_for_add_sub_immediate(yin->get_jint_constant())) { + yin->dont_load_item(); + } else if (tag == addressTag + && Assembler::operand_valid_for_add_sub_immediate(yin->get_address_constant())) { + yin->dont_load_item(); + } else if (tag == objectTag && yin->get_jobject_constant()->is_null_object()) { + yin->dont_load_item(); + } else { + yin->load_item(); + } + } else { + yin->load_item(); + } + + set_no_result(x); + + LIR_Opr left = xin->result(); + LIR_Opr right = yin->result(); + LIR_Condition lir_c = lir_cond(cond); + + // add safepoint before generating condition code so it can be recomputed + if (x->is_safepoint()) { + // increment backedge counter if needed + increment_backedge_counter_conditionally(lir_cond(cond), left, right, state_for(x, x->state_before()), + x->tsux()->bci(), x->fsux()->bci(), x->profiled_bci()); + __ safepoint(LIR_OprFact::illegalOpr, state_for(x, x->state_before())); + } + + +#ifdef __SOFTFP__ + if(x->x()->type()->is_float_kind() && !(hasFPU())) {// FPU-less cores + address entry; + bool unordered_flag = x->unordered_is_true() != (lir_c == lir_cond_greater || lir_c == lir_cond_lessEqual); + if (x->x()->type()->is_float()) { + entry = CAST_FROM_FN_PTR(address, unordered_flag ? 
SharedRuntime::fcmpg : SharedRuntime::fcmpl); + } else if (x->x()->type()->is_double()) { + entry = CAST_FROM_FN_PTR(address, unordered_flag ? SharedRuntime::dcmpg : SharedRuntime::dcmpl); + } else { + ShouldNotReachHere(); + } + + LIR_Opr fcmp_res = call_runtime(x->x(), x->y(), entry, intType, NULL); + LIR_Opr zero = LIR_OprFact::intConst(0); + __ cmp(lir_c, fcmp_res, zero); + } else +#endif + { + __ cmp(lir_c, left, right); + } + + // Generate branch profiling. Profiling code doesn't kill flags. + profile_branch(x, cond); + move_to_phi(x->state()); + + if (x->x()->type()->is_float_kind()) { + if(hasFPU()) { + __ branch(lir_c, right->type(), x->tsux(), x->usux()); + } else { + __ branch(lir_c, T_INT, x->tsux()); + } + } else + { + __ branch(lir_c, right->type(), x->tsux()); + } + assert(x->default_sux() == x->fsux(), "wrong destination above"); + __ jump(x->default_sux()); +} + +LIR_Opr LIRGenerator::getThreadPointer() { + return FrameMap::as_pointer_opr(rthread); +} + +void LIRGenerator::trace_block_entry(BlockBegin* block) { + __ move(LIR_OprFact::intConst(block->block_id()), FrameMap::r0_opr); + LIR_OprList* args = new LIR_OprList(1); + args->append(FrameMap::r0_opr); + address func = CAST_FROM_FN_PTR(address, Runtime1::trace_block_entry); + __ call_runtime_leaf(func, LIR_OprFact::illegalOpr, LIR_OprFact::illegalOpr, args); +} + +void LIRGenerator::volatile_field_store(LIR_Opr value, LIR_Address* address, + CodeEmitInfo* info) { + if (value->is_double_cpu()) { + __ move(value, FrameMap::long0_opr); + __ volatile_store_mem_reg(FrameMap::long0_opr, address, info); + } else { + __ volatile_store_mem_reg(value, address, info); + } +} + +void LIRGenerator::volatile_field_load(LIR_Address* address, LIR_Opr result, + CodeEmitInfo* info) { + if (result->is_double_cpu()) { + __ volatile_load_mem_reg(address, FrameMap::long0_opr, info); + __ move(FrameMap::long0_opr, result); + } else { + __ volatile_load_mem_reg(address, result, info); + } +} --- /dev/null 2018-09-25 19:24:26.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_LIR_aarch32.cpp 2018-09-25 19:24:26.000000000 +0300 @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/register.hpp" +#include "c1/c1_LIR.hpp" + +FloatRegister LIR_OprDesc::as_float_reg() const { + return as_FloatRegister(fpu_regnr()); +} + +FloatRegister LIR_OprDesc::as_double_reg() const { + return as_FloatRegister(fpu_regnrLo()); +} + +// Reg2 unused. 
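The "Reg2 unused" note above appears to be carried over from a port where that holds; in the factory below both halves of the pair are encoded, as its assert spells out. A minimal usage sketch, with illustrative register numbers:

    // Sketch: a C1 double occupies a consecutive pair of single-precision VFP registers.
    LIR_Opr d = LIR_OprFact::double_fpu(0 /* low half, s0 */, 1 /* high half, s1 */);
    FloatRegister lo = d->as_double_reg();   // reports the low register of the pair (see above)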
+LIR_Opr LIR_OprFact::double_fpu(int reg1, int reg2) { + assert(as_FloatRegister(reg2) != fnoreg, "aarch32 holds double in two regs."); + return (LIR_Opr)(intptr_t)((reg1 << LIR_OprDesc::reg1_shift) | + (reg2 << LIR_OprDesc::reg2_shift) | + LIR_OprDesc::double_type | + LIR_OprDesc::fpu_register | + LIR_OprDesc::double_size); +} + +#ifndef PRODUCT +void LIR_Address::verify() const { + assert(base()->is_cpu_register(), "wrong base operand"); + assert(index()->is_illegal() || index()->is_double_cpu() || index()->is_single_cpu(), "wrong index operand"); + assert(base()->type() == T_OBJECT || base()->type() == T_INT || base()->type() == T_METADATA, + "wrong type for addresses"); +} +#endif // PRODUCT + --- /dev/null 2018-09-25 19:24:27.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_LinearScan_aarch32.cpp 2018-09-25 19:24:27.000000000 +0300 @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "c1/c1_LinearScan.hpp" + +void LinearScan::allocate_fpu_stack() { + // No FPU stack on AArch32 +} --- /dev/null 2018-09-25 19:24:29.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_LinearScan_aarch32.hpp 2018-09-25 19:24:28.000000000 +0300 @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_C1_LINEARSCAN_AARCH32_HPP +#define CPU_AARCH32_VM_C1_LINEARSCAN_AARCH32_HPP + +inline bool LinearScan::is_processed_reg_num(int reg_num) { + return reg_num <= pd_last_cpu_reg || reg_num >= pd_nof_cpu_regs_frame_map; +} + +inline int LinearScan::num_physical_regs(BasicType type) { + if (type == T_LONG || type == T_DOUBLE) { + return 2; + } + return 1; +} + +inline bool LinearScan::requires_adjacent_regs(BasicType type) { + return type == T_DOUBLE; +} + +inline bool LinearScan::is_caller_save(int assigned_reg) { + assert(assigned_reg >= 0 && assigned_reg < nof_regs, + "should call this only for registers"); + // TODO: Remove the following line when support for callee-saved registers + // is added + return true; + if (assigned_reg < pd_first_callee_saved_cpu_reg) { + return true; + } + if (assigned_reg > pd_last_callee_saved_cpu_reg && + assigned_reg < pd_first_callee_saved_fpu_reg) { + return true; + } + if (assigned_reg > pd_last_callee_saved_fpu_reg && + assigned_reg <= pd_last_fpu_reg) { + return true; + } + return false; +} + +// If there are special cases when some particular LIR operations kill some +// specific registers, this behavior should be described here. An example +// can be found in x86 port. +inline void LinearScan::pd_add_temps(LIR_Op* op) { + if (op->code() == lir_move) { + LIR_Op1* move_op = op->as_Op1(); + if (move_op->move_kind() == lir_move_volatile) { + bool is_long = move_op->type() == T_LONG; + bool is_double = move_op->type() == T_DOUBLE; + bool is_store = move_op->in_opr()->is_register(); + if (is_double) { + add_temp(reg_num(FrameMap::long0_opr), op->id(), noUse, T_ILLEGAL); + add_temp(reg_numHi(FrameMap::long0_opr), op->id(), noUse, T_ILLEGAL); + } + if (is_store && (is_long || is_double)) { + add_temp(reg_num(FrameMap::long1_opr), op->id(), noUse, T_ILLEGAL); + add_temp(reg_numHi(FrameMap::long1_opr), op->id(), noUse, T_ILLEGAL); + } + } + } +} + +inline bool LinearScanWalker::pd_init_regs_for_alloc(Interval* cur) { +#ifndef HARD_FLOAT_CC + BasicType type = cur->type(); + if(!hasFPU()) { + if (type == T_FLOAT || type == T_DOUBLE) { + _first_reg = pd_first_cpu_reg; + _last_reg = FrameMap::last_cpu_reg();; + return true; + } + } +#endif + return false; +} + +#endif // CPU_AARCH32_VM_C1_LINEARSCAN_AARCH32_HPP --- /dev/null 2018-09-25 19:24:30.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_MacroAssembler_aarch32.cpp 2018-09-25 19:24:29.000000000 +0300 @@ -0,0 +1,434 @@ +/* + * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "classfile/systemDictionary.hpp" +#include "gc/shared/collectedHeap.hpp" +#include "interpreter/interpreter.hpp" +#include "oops/arrayOop.hpp" +#include "oops/markOop.hpp" +#include "runtime/basicLock.hpp" +#include "runtime/biasedLocking.hpp" +#include "runtime/os.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" + +void C1_MacroAssembler::float_cmp(bool is_float, int unordered_result, + FloatRegister f0, FloatRegister f1, + Register result) +{ + Label done; + if (is_float) { + vcmp_f32(f0, f1); + } else { + vcmp_f64(f0, f1); + } + + get_fpsr(); + + mov(result, 0); + if (unordered_result < 0) { + // we want -1 for unordered or less than, 0 for equal and 1 for + // greater than. + mov(result, 1, NE); // Not equal or unordered + neg(result, result, LT); // Less than or unordered + } else { + // we want -1 for less than, 0 for equal and 1 for unordered or + // greater than. + mov(result, 1, NE); // Not equal or unordered + neg(result, result, LO); // Less than + } +} + +int C1_MacroAssembler::lock_object(Register hdr, Register obj, Register disp_hdr, Register scratch, Label& slow_case) { + const int aligned_mask = BytesPerWord -1; + const int hdr_offset = oopDesc::mark_offset_in_bytes(); + assert(hdr != obj && hdr != disp_hdr && obj != disp_hdr, "registers must be different"); + Label done, fail; + int null_check_offset = -1; + + verify_oop(obj); + + // save object being locked into the BasicObjectLock + str(obj, Address(disp_hdr, BasicObjectLock::obj_offset_in_bytes())); + + if (UseBiasedLocking) { + assert(scratch != noreg, "should have scratch register at this point"); + null_check_offset = biased_locking_enter(obj, hdr, scratch, rscratch1, false, done, &slow_case); + } else { + null_check_offset = offset(); + } + + // Load object header + ldr(hdr, Address(obj, hdr_offset)); + // and mark it as unlocked + orr(hdr, hdr, markOopDesc::unlocked_value); + // save unlocked object header into the displaced header location on the stack + str(hdr, Address(disp_hdr, 0)); + // test if object header is still the same (i.e. 
unlocked), and if so, store the + // displaced header address in the object header - if it is not the same, get the + // object header instead + lea(rscratch2, Address(obj, hdr_offset)); + cmpxchgptr(hdr, disp_hdr, rscratch2, rscratch1, done, /*fallthough*/NULL); + // if the object header was the same, we're done + // if the object header was not the same, it is now in the hdr register + // => test if it is a stack pointer into the same stack (recursive locking), i.e.: + // + // 1) (hdr & aligned_mask) == 0 + // 2) sp <= hdr + // 3) hdr <= sp + page_size + // + // these 3 tests can be done by evaluating the following expression: + // + // (hdr - sp) & (aligned_mask - page_size) + // + // assuming both the stack pointer and page_size have their least + // significant 2 bits cleared and page_size is a power of 2 + mov(rscratch1, sp); + sub(hdr, hdr, rscratch1); + mov(rscratch2, aligned_mask - os::vm_page_size()); + ands(hdr, hdr, rscratch2); + // for recursive locking, the result is zero => save it in the displaced header + // location (NULL in the displaced hdr location indicates recursive locking) + str(hdr, Address(disp_hdr, 0)); + // otherwise we don't care about the result and handle locking via runtime call + cbnz(hdr, slow_case); + // done + bind(done); + if (PrintBiasedLockingStatistics) { + lea(rscratch2, ExternalAddress((address)BiasedLocking::fast_path_entry_count_addr())); + addmw(Address(rscratch2, 0), 1, rscratch1); + } + return null_check_offset; +} + + +void C1_MacroAssembler::unlock_object(Register hdr, Register obj, Register disp_hdr, Label& slow_case) { + const int aligned_mask = BytesPerWord -1; + const int hdr_offset = oopDesc::mark_offset_in_bytes(); + assert(hdr != obj && hdr != disp_hdr && obj != disp_hdr, "registers must be different"); + Label done; + + if (UseBiasedLocking) { + // load object + ldr(obj, Address(disp_hdr, BasicObjectLock::obj_offset_in_bytes())); + biased_locking_exit(obj, hdr, done); + } + + // load displaced header + ldr(hdr, Address(disp_hdr, 0)); + // if the loaded hdr is NULL we had recursive locking + // if we had recursive locking, we are done + cbz(hdr, done); + if (!UseBiasedLocking) { + // load object + ldr(obj, Address(disp_hdr, BasicObjectLock::obj_offset_in_bytes())); + } + verify_oop(obj); + // test if object header is pointing to the displaced header, and if so, restore + // the displaced header in the object - if the object header is not pointing to + // the displaced header, get the object header instead + // if the object header was not pointing to the displaced header, + // we do unlocking via runtime call + if (hdr_offset) { + lea(rscratch1, Address(obj, hdr_offset)); + cmpxchgptr(disp_hdr, hdr, rscratch1, rscratch2, done, &slow_case); + } else { + cmpxchgptr(disp_hdr, hdr, obj, rscratch2, done, &slow_case); + } + // done + bind(done); +} + + +// Defines obj, preserves var_size_in_bytes +void C1_MacroAssembler::try_allocate(Register obj, Register var_size_in_bytes, int con_size_in_bytes, Register t1, Register t2, Label& slow_case) { + if (UseTLAB) { + tlab_allocate(obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); + } else { + eden_allocate(obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); + } +} + +void C1_MacroAssembler::initialize_header(Register obj, Register klass, Register len, Register t1, Register t2) { + assert_different_registers(obj, klass, len); + if (UseBiasedLocking && !len->is_valid()) { + assert_different_registers(obj, klass, len, t1, t2); + ldr(t1, Address(klass, 
Klass::prototype_header_offset())); + } else { + // This assumes that all prototype bits fit in an int32_t + mov(t1, (int32_t)(intptr_t)markOopDesc::prototype()); + } + str(t1, Address(obj, oopDesc::mark_offset_in_bytes())); + str(klass, Address(obj, oopDesc::klass_offset_in_bytes())); + + if (len->is_valid()) { + str(len, Address(obj, arrayOopDesc::length_offset_in_bytes())); + } +} + +// preserves obj, destroys len_in_bytes +void C1_MacroAssembler::initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1) { + assert(hdr_size_in_bytes >= 0, "header size must be positive or 0"); + Label done; + + // len_in_bytes is positive and ptr sized + subs(len_in_bytes, len_in_bytes, hdr_size_in_bytes); + b(done, Assembler::EQ); + + // Preserve obj + if (hdr_size_in_bytes) + add(obj, obj, hdr_size_in_bytes); + zero_memory(obj, len_in_bytes, t1); + if (hdr_size_in_bytes) + sub(obj, obj, hdr_size_in_bytes); + + bind(done); +} + + +void C1_MacroAssembler::allocate_object(Register obj, Register t1, Register t2, int header_size, int object_size, Register klass, Label& slow_case) { + assert_different_registers(obj, t1, t2); // XXX really? + assert(header_size >= 0 && object_size >= header_size, "illegal sizes"); + + try_allocate(obj, noreg, object_size * BytesPerWord, t1, t2, slow_case); + + initialize_object(obj, klass, noreg, object_size * HeapWordSize, t1, t2, UseTLAB); +} + +// This method clobbers t1, t2, and rscratch1 registers. +void C1_MacroAssembler::initialize_object(Register obj, Register klass, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, Register t2, + bool is_tlab_allocated) { + assert((con_size_in_bytes & MinObjAlignmentInBytesMask) == 0, + "con_size_in_bytes is not multiple of alignment"); + + const int hdr_size_in_bytes = instanceOopDesc::header_size() * HeapWordSize; + + initialize_header(obj, klass, noreg, t1, t2); + + if (!(UseTLAB && ZeroTLAB && is_tlab_allocated)) { + // Null out rest of allocated space + const Register index = t2; + const int threshold = 8 * BytesPerWord; + if (var_size_in_bytes != noreg) { + mov(index, var_size_in_bytes); + initialize_body(obj, index, hdr_size_in_bytes, t1); + } else if (con_size_in_bytes <= threshold) { + // Emit required number of str instructions (unroll loop completely) + mov(t1, 0); + for (int i = hdr_size_in_bytes; i < con_size_in_bytes; i += BytesPerWord) { + str(t1, Address(obj, i)); + } + } else if (con_size_in_bytes > hdr_size_in_bytes) { + block_comment("zero memory"); + // Use loop to null out fields + int words = (con_size_in_bytes - hdr_size_in_bytes) / BytesPerWord; + mov(t1, 0); + + const int unroll = 4; // Number of str instructions we'll unroll + mov(index, words / unroll); + int remainder = words % unroll; + lea(rscratch1, Address(obj, hdr_size_in_bytes + remainder * BytesPerWord)); + + Label entry_point, loop; + b(entry_point); + bind(loop); + sub(index, index, 1); + for (int i = -unroll; i < 0; i++) { + if (-i == remainder) { + bind(entry_point); + } + str(t1, Address(rscratch1, i * BytesPerWord)); + } + if (remainder == 0) { + bind(entry_point); + } + add(rscratch1, rscratch1, unroll * BytesPerWord); + cbnz(index, loop); + } + } + + membar(StoreStore); + + if (CURRENT_ENV->dtrace_alloc_probes()) { + assert(obj == r0, "must be"); + far_call(RuntimeAddress(Runtime1::entry_for( + Runtime1::dtrace_object_alloc_id))); + } + + verify_oop(obj); +} + +void C1_MacroAssembler::allocate_array(Register obj, Register len, Register t1, Register t2, int header_size, int f, Register klass, 
Label& slow_case) { + assert_different_registers(obj, len, t1, t2, klass); + + // determine alignment mask + assert(!(BytesPerWord & 1), "must be a multiple of 2 for masking code to work"); + + // check for negative or excessive length + mov(rscratch1, (int32_t)max_array_allocation_length); + cmp(len, rscratch1); + b(slow_case, Assembler::HS); + + const Register arr_size = t2; // okay to be the same + // align object end + mov(arr_size, (int32_t)header_size * BytesPerWord + MinObjAlignmentInBytesMask); + add(arr_size, arr_size, len, Assembler::lsl(f)); + mov(t1, ~MinObjAlignmentInBytesMask); + andr(arr_size, arr_size, t1); + + try_allocate(obj, arr_size, 0, t1, t2, slow_case); + + initialize_header(obj, klass, len, t1, t2); + + // clear rest of allocated space + const Register len_zero = len; + initialize_body(obj, arr_size, header_size * BytesPerWord, len_zero); + + membar(StoreStore); + + if (CURRENT_ENV->dtrace_alloc_probes()) { + assert(obj == r0, "must be"); + far_call(RuntimeAddress(Runtime1::entry_for(Runtime1::dtrace_object_alloc_id))); + } + + verify_oop(obj); +} + + +void C1_MacroAssembler::inline_cache_check(Register receiver, Register iCache) { + verify_oop(receiver); + // explicit NULL check not needed since load from [klass_offset] causes a trap + // check against inline cache + assert(!MacroAssembler::needs_explicit_null_check(oopDesc::klass_offset_in_bytes()), "must add explicit null check"); + + cmp_klass(receiver, iCache, rscratch1); +} + +void C1_MacroAssembler::build_frame(int frame_size_in_bytes, + int bang_size_in_bytes) { + assert(bang_size_in_bytes >= frame_size_in_bytes, + "stack bang size incorrect"); + + // If we have to make this method not-entrant, we'll overwrite its first + // instruction with a jump. For this action to be legal we must ensure that + // this first instruction is a B, BL, NOP, BKPT, or SVC. Make it a NOP + nop(); + + // Make sure there is enough stack space for this method's activation + generate_stack_overflow_check(bang_size_in_bytes); + + // Push lr, rfp, and optionally update rfp. rfp points to the first stack + // word used by the new frame. + + if (FrameAPCS) { + mov(rscratch2, sp); + stmdb(sp, RegSet::of(rfp, rscratch2, lr, r15_pc).bits()); + add(rfp, sp, 3 * wordSize); + } else { + stmdb(sp, RegSet::of(rfp, lr).bits()); + if (PreserveFramePointer) { + add(rfp, sp, BytesPerWord); + } + } + + // Create frame. frame_size_in_bytes always comes from + // LIR_Assembler::initial_frame_size_in_bytes() method, and it already + // takes into account two stack words spent on saving lr and rfp. + decrement(sp, frame_size_in_bytes); +} + +void C1_MacroAssembler::remove_frame(int frame_size_in_bytes) { + if (FrameAPCS) { + ldmea(rfp, RegSet::of(rfp, sp, lr).bits(), false/*wb*/); + } else { + // Remove frame. frame_size_in_bytes always comes from + // LIR_Assembler::initial_frame_size_in_bytes() method, and it already + // takes into account two stack words spent on saving lr and rfp. + increment(sp, frame_size_in_bytes); + // Pop rfp and lr + ldmia(sp, RegSet::of(rfp, lr).bits()); + } +} + +void C1_MacroAssembler::verified_entry() { +} + +void C1_MacroAssembler::patchable_load(Register reg, address addr) { + nop(); + membar(Assembler::LoadLoad); + far_load(reg, addr); +} + +void C1_MacroAssembler::load_parameter(int offset_in_words, Register reg) { + //// Not APCS + // - 1: link + // fp 0: return address + // + 1: argument with offset 0 + // + 2: argument with offset 1 + // + 3: ... 
+ //// APCS + // - 3: link + // - 2: sp + // - 1: return address + // fp 0: pc + // + 1: argument with offset 0 + // + 2: argument with offset 1 + // + 3: ... + + ldr(reg, Address(rfp, (offset_in_words + 1) * BytesPerWord)); +} + +#ifndef PRODUCT + +void C1_MacroAssembler::verify_stack_oop(int stack_offset) { + if (!VerifyOops) return; + verify_oop_addr(Address(sp, stack_offset), "oop"); +} + +void C1_MacroAssembler::verify_not_null_oop(Register r) { + if (!VerifyOops) return; + Label not_null; + cbnz(r, not_null); + stop("non-null oop required"); + bind(not_null); + verify_oop(r); +} + +void C1_MacroAssembler::invalidate_registers(bool inv_r0, bool inv_r2, bool inv_r3) { +#ifdef ASSERT + static int nn; + if (inv_r0) mov(r0, 0xDEAD); + if (inv_r2) mov(r2, nn++); + if (inv_r3) mov(r3, 0xDEAD); +#endif +} +#endif // ifndef PRODUCT --- /dev/null 2018-09-25 19:24:31.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_MacroAssembler_aarch32.hpp 2018-09-25 19:24:30.000000000 +0300 @@ -0,0 +1,116 @@ +/* + * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_C1_MACROASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_VM_C1_MACROASSEMBLER_AARCH32_HPP + +//TODO: XXX: merge +//using MacroAssembler::build_frame; +using MacroAssembler::null_check; + +// C1_MacroAssembler contains high-level macros for C1 + + private: + int _rsp_offset; // track rsp changes + // initialization + void pd_init() { _rsp_offset = 0; } + + public: + void try_allocate( + Register obj, // result: pointer to object after successful allocation + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Register t2, // temp register + Label& slow_case // continuation point if fast allocation fails + ); + + void initialize_header(Register obj, Register klass, Register len, Register t1, Register t2); + void initialize_body(Register obj, Register len_in_bytes, int hdr_size_in_bytes, Register t1); + + void float_cmp(bool is_float, int unordered_result, + FloatRegister f0, FloatRegister f1, + Register result); + + // locking + // hdr : must be r0, contents destroyed + // obj : must point to the object to lock, contents preserved + // disp_hdr: must point to the displaced header location, contents preserved + // scratch : scratch register, contents destroyed + // returns code offset at which to add null check debug information + int lock_object (Register swap, Register obj, Register disp_hdr, Register scratch, Label& slow_case); + + // unlocking + // hdr : contents destroyed + // obj : must point to the object to lock, contents preserved + // disp_hdr: must be r0 & must point to the displaced header location, contents destroyed + void unlock_object(Register swap, Register obj, Register lock, Label& slow_case); + + void initialize_object( + Register obj, // result: pointer to object after successful allocation + Register klass, // object klass + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Register t2, // temp register + bool is_tlab_allocated // the object was allocated in a TLAB; relevant for the implementation of ZeroTLAB + ); + + // allocation of fixed-size objects + // (can also be used to allocate fixed-size arrays, by setting + // hdr_size correctly and storing the array length afterwards) + // obj : will contain pointer to allocated object + // t1, t2 : scratch registers - contents destroyed + // header_size: size of object header in words + // object_size: total size of object in words + // slow_case : exit to slow case implementation if fast allocation fails + void allocate_object(Register obj, Register t1, Register t2, int header_size, int object_size, Register klass, Label& slow_case); + + enum { + max_array_allocation_length = 0x00FFFFFF + }; + + // allocation of arrays + // obj : will contain pointer to allocated object + // len : array length in number of elements + // t : scratch register - contents destroyed + // header_size: size of object header in words + // f : element scale factor + // slow_case : exit to slow case implementation if fast allocation fails + void allocate_array(Register obj, Register len, Register t, Register t2, int header_size, int f, Register klass, Label& slow_case); + + int rsp_offset() const { return _rsp_offset; } + void set_rsp_offset(int n) { _rsp_offset = n; } + + void invalidate_registers(bool inv_r0, bool 
inv_r2, bool inv_r3) PRODUCT_RETURN; + + void patchable_load(Register reg, address addr); + // This platform only uses signal-based null checks. The Label is not needed. + void null_check(Register r, Label *Lnull = NULL) { MacroAssembler::null_check(r); } + + void load_parameter(int offset_in_words, Register reg); + +#endif // CPU_AARCH32_VM_C1_MACROASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:32.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_Runtime1_aarch32.cpp 2018-09-25 19:24:31.000000000 +0300 @@ -0,0 +1,1154 @@ +/* + * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "c1/c1_CodeStubs.hpp" +#include "c1/c1_Defs.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "c1/c1_Runtime1.hpp" +#include "compiler/disassembler.hpp" +#include "gc/shared/cardTable.hpp" +#include "gc/shared/cardTableBarrierSet.hpp" +#include "interpreter/interpreter.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/compiledICHolder.hpp" +#include "oops/oop.inline.hpp" +#include "prims/jvmtiExport.hpp" +#include "register_aarch32.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/signature.hpp" +#include "runtime/vframe.hpp" +#include "runtime/vframeArray.hpp" +#include "vmreg_aarch32.inline.hpp" + +// Implementation of StubAssembler + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, int args_size) { + // setup registers + assert(!(oop_result1->is_valid() || metadata_result->is_valid()) || oop_result1 != metadata_result, "registers must be different"); + assert(oop_result1 != rthread && metadata_result != rthread, "registers must be different"); + assert(args_size >= 0, "illegal args_size"); + + mov(c_rarg0, rthread); + set_num_rt_args(0); // Nothing on stack + + Label retaddr; + set_last_Java_frame(sp, rfp, retaddr, rscratch1); + + // do the call + lea(rscratch1, RuntimeAddress(entry)); + bl(rscratch1); + bind(retaddr); + int call_offset = offset(); + // verify callee-saved register +#ifdef ASSERT + push(r0, sp); + { Label L; + get_thread(r0); + cmp(rthread, r0); + b(L, Assembler::EQ); + stop("StubAssembler::call_RT: rthread not callee saved?"); + bind(L); + } + pop(r0, sp); +#endif + reset_last_Java_frame(true); + maybe_isb(); + + // check for pending exceptions + { Label L; + // check for pending exceptions (java_thread is set upon return) + 
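The check that follows is the standard post-call pattern; in C-level terms it behaves roughly like the sketch below, where thread stands for the current JavaThread (illustrative only, the real work stays in the assembly):

    if (thread->has_pending_exception()) {
      if (oop_result1->is_valid())      thread->set_vm_result(NULL);     // drop a partial oop result
      if (metadata_result->is_valid())  thread->set_vm_result_2(NULL);   // drop a partial metadata result
      // then either remove the activation or re-enter via the forward_exception stub
    }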
ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + cbz(rscratch1, L); + mov(rscratch1, 0); + // exception pending => remove activation and forward to exception handler + // make sure that the vm_results are cleared + if (oop_result1->is_valid()) { + str(rscratch1, Address(rthread, JavaThread::vm_result_offset())); + } + if (metadata_result->is_valid()) { + str(rscratch1, Address(rthread, JavaThread::vm_result_2_offset())); + } + if (frame_size() == no_frame_size) { + leave(); + far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); + } else if (_stub_id == Runtime1::forward_exception_id) { + should_not_reach_here(); + } else { + far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::forward_exception_id))); + } + bind(L); + } + // get oop results if there are any and reset the values in the thread + if (oop_result1->is_valid()) { + get_vm_result(oop_result1, rthread); + } + if (metadata_result->is_valid()) { + get_vm_result_2(metadata_result, rthread); + } + return call_offset; +} + + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1) { + mov(c_rarg1, arg1); + return call_RT(oop_result1, metadata_result, entry, 1); +} + + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1, Register arg2) { + if (c_rarg1 == arg2) { + if (c_rarg2 == arg1) { + mov(rscratch1, arg1); + mov(arg1, arg2); + mov(arg2, rscratch1); + } else { + mov(c_rarg2, arg2); + mov(c_rarg1, arg1); + } + } else { + mov(c_rarg1, arg1); + mov(c_rarg2, arg2); + } + return call_RT(oop_result1, metadata_result, entry, 2); +} + + +int StubAssembler::call_RT(Register oop_result1, Register metadata_result, address entry, Register arg1, Register arg2, Register arg3) { + // if there is any conflict use the stack + if (arg1 == c_rarg2 || arg1 == c_rarg3 || + arg2 == c_rarg1 || arg2 == c_rarg3 || + arg3 == c_rarg1 || arg3 == c_rarg2) { + push(arg2); + push(arg3); + push(arg1); + pop(c_rarg1); + pop(c_rarg3); + pop(c_rarg2); + } else { + mov(c_rarg1, arg1); + mov(c_rarg2, arg2); + mov(c_rarg3, arg3); + } + return call_RT(oop_result1, metadata_result, entry, 3); +} + +// Implementation of StubFrame + +class StubFrame: public StackObj { + private: + StubAssembler* _sasm; + + public: + StubFrame(StubAssembler* sasm, const char* name, bool must_gc_arguments); + void load_argument(int offset_in_words, Register reg); + + ~StubFrame(); +};; + +void StubAssembler::prologue(const char* name, bool must_gc_arguments) { + set_info(name, must_gc_arguments); + enter(); +} + +void StubAssembler::epilogue() { + leave(); + ret(lr); +} + +#define __ _sasm-> + +StubFrame::StubFrame(StubAssembler* sasm, const char* name, bool must_gc_arguments) { + _sasm = sasm; + __ prologue(name, must_gc_arguments); +} + +// load parameters that were stored with LIR_Assembler::store_parameter +// Note: offsets for store_parameter and load_argument must match +void StubFrame::load_argument(int offset_in_words, Register reg) { + __ load_parameter(offset_in_words, reg); +} + + +StubFrame::~StubFrame() { + __ epilogue(); +} + +#undef __ + + +// Implementation of Runtime1 + +#define __ sasm-> + + +// Stack layout for saving/restoring all the registers needed during a runtime +// call (this includes deoptimization) +// Note: note that users of this frame may well have arguments to some runtime +// while these values are on the stack. 
These positions neglect those arguments +// but the code in save_live_registers will take the argument count into +// account. +// + +enum reg_save_layout { + reg_save_s0, + reg_save_s31 = reg_save_s0 + FrameMap::nof_fpu_regs - 1, + reg_save_pad, // to align to doubleword to simplify conformance to APCS + reg_save_r0, + reg_save_r1, + reg_save_r2, + reg_save_r3, + reg_save_r4, + reg_save_r5, + reg_save_r6, + reg_save_r7, + reg_save_r8, + reg_save_r9, + reg_save_r10, + reg_save_r11, + reg_save_r12, + reg_save_frame_size + // remaining words pushed by enter +}; + +// Save off registers which might be killed by calls into the runtime. +// Tries to smart of about FP registers. In particular we separate +// saving and describing the FPU registers for deoptimization since we +// have to save the FPU registers twice if we describe them. The +// deopt blob is the only thing which needs to describe FPU registers. +// In all other cases it should be sufficient to simply save their +// current value. + +static int cpu_reg_save_offsets[FrameMap::nof_cpu_regs]; +static int fpu_reg_save_offsets[FrameMap::nof_fpu_regs]; +static int reg_save_size_in_words; +static int frame_size_in_bytes = -1; + +static OopMap* generate_oop_map(StubAssembler* sasm, bool save_fpu_registers) { + int frame_size_in_bytes = (reg_save_frame_size + frame::get_frame_size()) * BytesPerWord; + sasm->set_frame_size(frame_size_in_bytes / BytesPerWord); + int frame_size_in_slots = frame_size_in_bytes / sizeof(jint); + OopMap* oop_map = new OopMap(frame_size_in_slots, 0); + + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r0), r0->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r1), r1->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r2), r2->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r3), r3->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r4), r4->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r5), r5->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r6), r6->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r7), r7->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r8), r8->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r9), r9->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r10), r10->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r11), r11->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_r12), r12->as_VMReg()); + if (hasFPU()) { + for (int i = 0; i < FrameMap::nof_fpu_regs; ++i) { + oop_map->set_callee_saved(VMRegImpl::stack2reg(reg_save_s0 + i), as_FloatRegister(i)->as_VMReg()); + } + } + + return oop_map; +} + +static OopMap* save_live_registers(StubAssembler* sasm, + bool save_fpu_registers = true) { + __ block_comment("save_live_registers"); + + __ push(RegSet::range(r0, r12), sp); // integer registers except lr & sp + __ sub(sp, sp, 4); // align to 8 bytes + + if (save_fpu_registers && hasFPU()) { + __ vstmdb_f64(sp, (1 << FrameMap::nof_fpu_regs / 2) - 1); + } else { + __ sub(sp, sp, FrameMap::nof_fpu_regs * 4); + } + + return generate_oop_map(sasm, save_fpu_registers); +} + +static void restore_live_registers(StubAssembler* sasm, bool restore_fpu_registers = true) { + + if (restore_fpu_registers && hasFPU()) { + __ vldmia_f64(sp, (1 << FrameMap::nof_fpu_regs / 2) - 1); + } else { + __ add(sp, sp, FrameMap::nof_fpu_regs * 4); + } + + __ 
add(sp, sp, 4); + __ pop(RegSet::range(r0, r12), sp); +} + +static void restore_live_registers_except_r0(StubAssembler* sasm, bool restore_fpu_registers = true) { + + if (restore_fpu_registers && hasFPU()) { + __ vldmia_f64(sp, (1 << FrameMap::nof_fpu_regs / 2) - 1); + } else { + __ add(sp, sp, FrameMap::nof_fpu_regs * 4); + } + + __ add(sp, sp, 8); + __ pop(RegSet::range(r1, r12), sp); +} + +void Runtime1::initialize_pd() { +} + +// target: the entry point of the method that creates and posts the exception oop +// has_argument: true if the exception needs arguments (passed in rscratch1 and rscratch2) + +OopMapSet* Runtime1::generate_exception_throw(StubAssembler* sasm, address target, bool has_argument) { + // make a frame and preserve the caller's caller-save registers + OopMap* oop_map = save_live_registers(sasm); + int call_offset; + if (!has_argument) { + call_offset = __ call_RT(noreg, noreg, target); + } else { + call_offset = __ call_RT(noreg, noreg, target, rscratch1, rscratch2); + } + OopMapSet* oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, oop_map); + + __ should_not_reach_here(); + return oop_maps; +} + + +OopMapSet* Runtime1::generate_handle_exception(StubID id, StubAssembler *sasm) { + __ block_comment("generate_handle_exception"); + + // incoming parameters + const Register exception_oop = r0; + const Register exception_pc = r3; + // other registers used in this stub + + // Save registers, if required. + OopMapSet* oop_maps = new OopMapSet(); + OopMap* oop_map = NULL; + switch (id) { + case forward_exception_id: + // We're handling an exception in the context of a compiled frame. + // The registers have been saved in the standard places. Perform + // an exception lookup in the caller and dispatch to the handler + // if found. Otherwise unwind and dispatch to the callers + // exception handler. + oop_map = generate_oop_map(sasm, 1 /*thread*/); + __ mov(rscratch1, 0); + + // load and clear pending exception oop into r0 + __ ldr(exception_oop, Address(rthread, Thread::pending_exception_offset())); + __ str(rscratch1, Address(rthread, Thread::pending_exception_offset())); + + // load issuing PC (the return address for this stub) into r3 + __ ldr(exception_pc, Address(rfp, wordSize * frame::get_return_addr_offset())); + + // make sure that the vm_results are cleared (may be unnecessary) + __ str(rscratch1, Address(rthread, JavaThread::vm_result_offset())); + __ str(rscratch1, Address(rthread, JavaThread::vm_result_2_offset())); + break; + case handle_exception_nofpu_id: + case handle_exception_id: + // At this point all registers MAY be live. + oop_map = save_live_registers(sasm, id != handle_exception_nofpu_id); + break; + case handle_exception_from_callee_id: { + // At this point all registers except exception oop (r0) and + // exception pc (lr) are dead. 
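For orientation, the register and state contract of the three flavors handled here can be summarized as follows (a restatement of the cases above and below, not new behavior):

    // forward_exception_id:            exception still sits in Thread::pending_exception;
    //                                  registers were saved in the standard places.
    // handle_exception[_nofpu]_id:     r0 = exception oop, r3 = throwing pc;
    //                                  all registers may be live, so they are saved here.
    // handle_exception_from_callee_id: r0 = exception oop, lr = throwing pc; everything
    //                                  else is dead, so only a minimal frame is described.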
+ const int frame_size = frame::get_frame_size() /*fp, return address, ...*/; + assert(frame_size*wordSize % StackAlignmentInBytes == 0, "must be"); + oop_map = new OopMap(frame_size * VMRegImpl::slots_per_word, 0); + sasm->set_frame_size(frame_size); + break; + } + default: + __ should_not_reach_here(); + break; + } + + // verify that only r0 and r3 are valid at this time + __ invalidate_registers(false, true, false); + // verify that r0 contains a valid exception + __ verify_not_null_oop(exception_oop); + +#ifdef ASSERT + // check that fields in JavaThread for exception oop and issuing pc are + // empty before writing to them + Label oop_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ cbz(rscratch1, oop_empty); + __ stop("exception oop already set"); + __ bind(oop_empty); + + Label pc_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + __ cbz(rscratch1, pc_empty); + __ stop("exception pc already set"); + __ bind(pc_empty); +#endif + + // save exception oop and issuing pc into JavaThread + // (exception handler will load it from here) + __ str(exception_oop, Address(rthread, JavaThread::exception_oop_offset())); + __ str(exception_pc, Address(rthread, JavaThread::exception_pc_offset())); + + // patch throwing pc into return address (has bci & oop map) + __ str(exception_pc, Address(rfp, wordSize * frame::get_return_addr_offset())); + + // compute the exception handler. + // the exception oop and the throwing pc are read from the fields in JavaThread + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, exception_handler_for_pc)); + oop_maps->add_gc_map(call_offset, oop_map); + + // r0: handler address + // will be the deopt blob if nmethod was deoptimized while we looked up + // handler regardless of whether handler existed in the nmethod. + + // only r0 is valid at this time, all other registers have been destroyed by the runtime call + __ invalidate_registers(false, true, true); + + // patch the return address, this stub will directly return to the exception handler + __ str(r0, Address(rfp, wordSize * frame::get_return_addr_offset())); + + switch (id) { + case forward_exception_id: + case handle_exception_nofpu_id: + case handle_exception_id: + // Restore the registers that were saved at the beginning. + restore_live_registers(sasm, id != handle_exception_nofpu_id); + break; + case handle_exception_from_callee_id: + // Pop the return address. + __ leave(); + __ ret(lr); // jump to exception handler + break; + default: ShouldNotReachHere(); + } + + return oop_maps; +} + + +void Runtime1::generate_unwind_exception(StubAssembler *sasm) { + // incoming parameters + const Register exception_oop = r0; + // other registers used in this stub + const Register exception_pc = r3; + const Register handler_addr = r1; + + // verify that only r0, is valid at this time + __ invalidate_registers(false, true, true); + +#ifdef ASSERT + // check that fields in JavaThread for exception oop and issuing pc are empty + Label oop_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ cbz(rscratch1, oop_empty); + __ stop("exception oop must be empty"); + __ bind(oop_empty); + + Label pc_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + __ cbz(rscratch1, pc_empty); + __ stop("exception pc must be empty"); + __ bind(pc_empty); +#endif + + // Save our return address because + // exception_handler_for_return_address will destroy it. 
We also + // save exception_oop + __ push(exception_oop); + __ push(lr); + + // search the exception handler address of the caller (using the return address) + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), rthread, lr); + // r0: exception handler address of the caller + + // Only R0 is valid at this time; all other registers have been + // destroyed by the call. + __ invalidate_registers(false, true, true); + + // move result of call into correct register + __ mov(handler_addr, r0); + + // get throwing pc (= return address). + // lr has been destroyed by the call + __ pop(lr); + __ pop(exception_oop); + __ mov(r3, lr); + + __ verify_not_null_oop(exception_oop); + + // continue at exception handler (return address removed) + // note: do *not* remove arguments when unwinding the + // activation since the caller assumes having + // all arguments on the stack when entering the + // runtime to determine the exception handler + // (GC happens at call site with arguments!) + // r0: exception oop + // r3: throwing pc + // r1: exception handler + __ b(handler_addr); +} + + + +OopMapSet* Runtime1::generate_patching(StubAssembler* sasm, address target) { + // use the maximum number of runtime-arguments here because it is difficult to + // distinguish each RT-Call. + // Note: This number affects also the RT-Call in generate_handle_exception because + // the oop-map is shared for all calls. + DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob(); + assert(deopt_blob != NULL, "deoptimization blob must have been created"); + + OopMap* oop_map = save_live_registers(sasm); + + __ mov(c_rarg0, rthread); + Label retaddr; + __ set_last_Java_frame(sp, rfp, retaddr, rscratch1); + // do the call + __ lea(rscratch1, RuntimeAddress(target)); + __ bl(rscratch1); + __ bind(retaddr); + OopMapSet* oop_maps = new OopMapSet(); + oop_maps->add_gc_map(__ offset(), oop_map); + // verify callee-saved register +#ifdef ASSERT + { Label L; + __ get_thread(rscratch1); + __ cmp(rthread, rscratch1); + __ b(L, Assembler::EQ); + __ stop("StubAssembler::call_RT: rthread not callee saved?"); + __ bind(L); + } +#endif + __ reset_last_Java_frame(true); + __ maybe_isb(); + + // check for pending exceptions + { Label L; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, L); + // exception pending => remove activation and forward to exception handler + + { Label L1; + __ cbnz(r0, L1); // have we deoptimized? + __ far_jump(RuntimeAddress(Runtime1::entry_for(Runtime1::forward_exception_id))); + __ bind(L1); + } + + // the deopt blob expects exceptions in the special fields of + // JavaThread, so copy and clear pending exception. 
+ + // load and clear pending exception + __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, Thread::pending_exception_offset())); + + // check that there is really a valid exception + __ verify_not_null_oop(r0); + + // load throwing pc: this is the return address of the stub + __ ldr(r3, Address(rfp, wordSize * frame::get_return_addr_offset())); + +#ifdef ASSERT + // check that fields in JavaThread for exception oop and issuing pc are empty + Label oop_empty; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, oop_empty); + __ stop("exception oop must be empty"); + __ bind(oop_empty); + + Label pc_empty; + __ ldr(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + __ cbz(rscratch1, pc_empty); + __ stop("exception pc must be empty"); + __ bind(pc_empty); +#endif + + // store exception oop and throwing pc to JavaThread + __ str(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ str(r3, Address(rthread, JavaThread::exception_pc_offset())); + + restore_live_registers(sasm); + + __ leave(); + + // Forward the exception directly to deopt blob. We can blow no + // registers and must leave throwing pc on the stack. A patch may + // have values live in registers so the entry point with the + // exception in tls. + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_exception_in_tls())); + + __ bind(L); + } + + + // Runtime will return true if the nmethod has been deoptimized during + // the patching process. In that case we must do a deopt reexecute instead. + + Label reexecuteEntry, cont; + + __ cbz(r0, cont); // have we deoptimized? + + // Will reexecute. Proper return address is already on the stack we just restore + // registers, pop all of our frame but the return address and jump to the deopt blob + restore_live_registers(sasm); + __ leave(); + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_reexecution())); + + __ bind(cont); + restore_live_registers(sasm); + __ leave(); + __ ret(lr); + + return oop_maps; +} + + +OopMapSet* Runtime1::generate_code_for(StubID id, StubAssembler* sasm) { + + const Register exception_oop = r0; + const Register exception_pc = r3; + + // for better readability + const bool must_gc_arguments = true; + const bool dont_gc_arguments = false; + + // default value; overwritten for some optimized stubs that are called from methods that do not use the fpu + bool save_fpu_registers = true; + + // stub code & info for the different stubs + OopMapSet* oop_maps = NULL; + OopMap* oop_map = NULL; + switch (id) { + { + case forward_exception_id: + { + oop_maps = generate_handle_exception(id, sasm); + __ leave(); + __ ret(lr); + } + break; + + case throw_div0_exception_id: + { StubFrame f(sasm, "throw_div0_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_div0_exception), false); + } + break; + + case throw_null_pointer_exception_id: + { StubFrame f(sasm, "throw_null_pointer_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_null_pointer_exception), false); + } + break; + + case new_instance_id: + case fast_new_instance_id: + case fast_new_instance_init_check_id: + { + Register klass = r3; // Incoming + Register obj = r0; // Result + + if (id == new_instance_id) { + __ set_info("new_instance", dont_gc_arguments); + } else if (id == fast_new_instance_id) { + __ set_info("fast new_instance", dont_gc_arguments); + } else { 
+ assert(id == fast_new_instance_init_check_id, "bad StubID"); + __ set_info("fast new_instance init check", dont_gc_arguments); + } + + // If TLAB is disabled, see if there is support for inlining contiguous + // allocations. + // Otherwise, just go to the slow path. + if ((id == fast_new_instance_id || id == fast_new_instance_init_check_id) && + !UseTLAB && Universe::heap()->supports_inline_contig_alloc()) { + Label slow_path; + Register obj_size = r2; + Register t1 = r5; + Register t2 = r4; + assert_different_registers(klass, obj, obj_size, t1, t2); + + __ push(t1); + __ push(t2); + if (id == fast_new_instance_init_check_id) { + // make sure the klass is initialized + __ ldrb(rscratch1, Address(klass, InstanceKlass::init_state_offset())); + __ cmp(rscratch1, InstanceKlass::fully_initialized); + __ b(slow_path, Assembler::NE); + } + +#ifdef ASSERT + // assert object can be fast path allocated + { + Label ok, not_ok; + __ ldr(obj_size, Address(klass, Klass::layout_helper_offset())); + __ cmp(obj_size, 0u); + __ b(not_ok, Assembler::LE); // Make sure it's an instance (layout helper is positive) + __ tst(obj_size, Klass::_lh_instance_slow_path_bit); + __ b(ok, Assembler::EQ); + __ bind(not_ok); + __ stop("assert(can be fast path allocated)"); + __ should_not_reach_here(); + __ bind(ok); + } +#endif // ASSERT + + // get the instance size + __ ldr(obj_size, Address(klass, Klass::layout_helper_offset())); + + __ eden_allocate(obj, obj_size, 0, t1, slow_path); + + __ initialize_object(obj, klass, obj_size, 0, t1, t2, /* is_tlab_allocated */ false); + __ verify_oop(obj); + __ pop(t2); + __ pop(t1); + __ ret(lr); + + __ bind(slow_path); + __ pop(t2); + __ pop(t1); + } + + __ enter(); + OopMap* map = save_live_registers(sasm); + int call_offset = __ call_RT(obj, noreg, CAST_FROM_FN_PTR(address, new_instance), klass); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers_except_r0(sasm); + __ verify_oop(obj); + __ leave(); + __ ret(lr); + + // r0,: new instance + } + + break; + + case counter_overflow_id: + { + Register bci = r0, method = r1; + __ enter(); + OopMap* map = save_live_registers(sasm); + // Retrieve bci + __ ldr(bci, Address(rfp, 1*BytesPerWord)); + // And a pointer to the Method* + __ ldr(method, Address(rfp, 2*BytesPerWord)); + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, counter_overflow), bci, method); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm); + __ leave(); + __ ret(lr); + } + break; + + case new_type_array_id: + case new_object_array_id: + { + Register length = r6; // Incoming + Register klass = r3; // Incoming + Register obj = r0; // Result + + if (id == new_type_array_id) { + __ set_info("new_type_array", dont_gc_arguments); + } else { + __ set_info("new_object_array", dont_gc_arguments); + } + +#ifdef ASSERT + // assert object type is really an array of the proper kind + { + Label ok; + Register t0 = obj; + __ ldr(t0, Address(klass, Klass::layout_helper_offset())); + __ asr(t0, t0, Klass::_lh_array_tag_shift); + int tag = ((id == new_type_array_id) + ? Klass::_lh_array_tag_type_value + : Klass::_lh_array_tag_obj_value); + __ mov(rscratch1, tag); + __ cmp(t0, rscratch1); + __ b(ok, Assembler::EQ); + __ stop("assert(is an array klass)"); + __ should_not_reach_here(); + __ bind(ok); + } +#endif // ASSERT + + // If TLAB is disabled, see if there is support for inlining contiguous + // allocations. + // Otherwise, just go to the slow path. 
+ if (!UseTLAB && Universe::heap()->supports_inline_contig_alloc()) { + Register arr_size = r4; + Register t1 = r2; + Register t2 = r5; + Label slow_path; + assert_different_registers(length, klass, obj, arr_size, t1, t2); + + // check that array length is small enough for fast path. + __ mov(rscratch1, C1_MacroAssembler::max_array_allocation_length); + __ cmp(length, rscratch1); + __ b(slow_path, Assembler::HI); + + // get the allocation size: round_up(hdr + length << (layout_helper & 0x1F)) + __ ldr(t1, Address(klass, Klass::layout_helper_offset())); + __ andr(rscratch1, t1, 0x1f); + __ lsl(arr_size, length, rscratch1); + __ extract_bits(t1, t1, Klass::_lh_header_size_shift, + exact_log2(Klass::_lh_header_size_mask + 1)); + __ add(arr_size, arr_size, t1); + __ add(arr_size, arr_size, MinObjAlignmentInBytesMask); // align up + __ mov(rscratch1, ~MinObjAlignmentInBytesMask); + __ andr(arr_size, arr_size, rscratch1); + + __ eden_allocate(obj, arr_size, 0, t1, slow_path); // preserves arr_size + + __ initialize_header(obj, klass, length, t1, t2); + // Assume Little-Endian + __ ldrb(t1, Address(klass, in_bytes(Klass::layout_helper_offset()) + (Klass::_lh_header_size_shift / BitsPerByte))); + assert(Klass::_lh_header_size_shift % BitsPerByte == 0, "bytewise"); + assert(Klass::_lh_header_size_mask <= 0xFF, "bytewise"); + __ andr(t1, t1, Klass::_lh_header_size_mask); + __ sub(arr_size, arr_size, t1); // body length + __ add(t1, t1, obj); // body start + __ initialize_body(t1, arr_size, 0, t2); + __ verify_oop(obj); + + __ ret(lr); + + __ bind(slow_path); + } + + __ enter(); + OopMap* map = save_live_registers(sasm); + int call_offset; + if (id == new_type_array_id) { + call_offset = __ call_RT(obj, noreg, CAST_FROM_FN_PTR(address, new_type_array), klass, length); + } else { + call_offset = __ call_RT(obj, noreg, CAST_FROM_FN_PTR(address, new_object_array), klass, length); + } + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers_except_r0(sasm); + + __ verify_oop(obj); + __ leave(); + __ ret(lr); + + // r0: new array + } + break; + + case new_multi_array_id: + { StubFrame f(sasm, "new_multi_array", dont_gc_arguments); + // r1: klass + // r2: rank + // r3: address of 1st dimension + OopMap* map = save_live_registers(sasm); + int call_offset = __ call_RT(r0, noreg, CAST_FROM_FN_PTR(address, new_multi_array), r1, r2, r3); + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers_except_r0(sasm); + + // r0,: new multi array + __ verify_oop(r0); + } + break; + + case register_finalizer_id: + { + __ set_info("register_finalizer", dont_gc_arguments); + + // This is called via call_runtime so the arguments + // will be place in C abi locations + + __ verify_oop(c_rarg0); + + // load the klass and check the has finalizer flag + Label register_finalizer; + Register t = r5; + __ load_klass(t, r0); + __ ldr(t, Address(t, Klass::access_flags_offset())); + __ tst(t, JVM_ACC_HAS_FINALIZER); + __ b(register_finalizer, Assembler::NE); + __ ret(lr); + + __ bind(register_finalizer); + __ enter(); + OopMap* oop_map = save_live_registers(sasm); + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, SharedRuntime::register_finalizer), r0); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, oop_map); + + // Now restore all the live registers + restore_live_registers(sasm); + + __ leave(); + __ ret(lr); + } + break; + + case throw_class_cast_exception_id: + { StubFrame f(sasm, 
"throw_class_cast_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_class_cast_exception), true); + } + break; + + case throw_incompatible_class_change_error_id: + { StubFrame f(sasm, "throw_incompatible_class_cast_exception", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_incompatible_class_change_error), false); + } + break; + + case slow_subtype_check_id: + { + // Typical calling sequence: + // __ push(klass_RInfo); // object klass or other subclass + // __ push(sup_k_RInfo); // array element klass or other superclass + // __ bl(slow_subtype_check); + // Note that the subclass is pushed first, and is therefore deepest. + enum layout { + r0_off, + r2_off, + r4_off, + r5_off, + sup_k_off, + klass_off, + framesize, + result_off = sup_k_off + }; + + __ set_info("slow_subtype_check", dont_gc_arguments); + __ push(RegSet::of(r0, r2, r4, r5), sp); + + // This is called by pushing args and not with C abi + __ ldr(r4, Address(sp, (klass_off) * VMRegImpl::stack_slot_size)); // subclass + __ ldr(r0, Address(sp, (sup_k_off) * VMRegImpl::stack_slot_size)); // superclass + + + Label miss; + __ check_klass_subtype_slow_path(r4, r0, r2, r5, NULL, &miss); + + // fallthrough on success: + __ mov(rscratch1, 1); + __ str(rscratch1, Address(sp, (result_off) * VMRegImpl::stack_slot_size)); // result + __ pop(RegSet::of(r0, r2, r4, r5), sp); + __ ret(lr); + + __ bind(miss); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(sp, (result_off) * VMRegImpl::stack_slot_size)); // result + __ pop(RegSet::of(r0, r2, r4, r5), sp); + __ ret(lr); + } + break; + + case monitorenter_nofpu_id: + save_fpu_registers = false; + // fall through + case monitorenter_id: + { + StubFrame f(sasm, "monitorenter", dont_gc_arguments); + OopMap* map = save_live_registers(sasm, save_fpu_registers); + + // Called with store_parameter and not C abi + + f.load_argument(1, r0); // r0,: object + f.load_argument(0, r1); // r1,: lock address + + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, monitorenter), r0, r1); + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm, save_fpu_registers); + } + break; + + case monitorexit_nofpu_id: + save_fpu_registers = false; + // fall through + case monitorexit_id: + { + StubFrame f(sasm, "monitorexit", dont_gc_arguments); + OopMap* map = save_live_registers(sasm, save_fpu_registers); + + // Called with store_parameter and not C abi + + f.load_argument(0, r0); // r0,: lock address + + // note: really a leaf routine but must setup last java sp + // => use call_RT for now (speed can be improved by + // doing last java sp setup manually) + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, monitorexit), r0); + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm, save_fpu_registers); + } + break; + + case deoptimize_id: + { + StubFrame f(sasm, "deoptimize", dont_gc_arguments); + OopMap* oop_map = save_live_registers(sasm); + f.load_argument(0, c_rarg1); + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, deoptimize), c_rarg1); + + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, oop_map); + restore_live_registers(sasm); + DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob(); + assert(deopt_blob != NULL, "deoptimization blob must have been created"); + __ leave(); + __ 
far_jump(RuntimeAddress(deopt_blob->unpack_with_reexecution())); + } + break; + + case throw_range_check_failed_id: + { StubFrame f(sasm, "range_check_failed", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_range_check_exception), true); + } + break; + + case unwind_exception_id: + { __ set_info("unwind_exception", dont_gc_arguments); + // note: no stubframe since we are about to leave the current + // activation and we are calling a leaf VM function only. + generate_unwind_exception(sasm); + } + break; + + case access_field_patching_id: + { StubFrame f(sasm, "access_field_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, access_field_patching)); + } + break; + + case load_klass_patching_id: + { StubFrame f(sasm, "load_klass_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_klass_patching)); + } + break; + + case load_mirror_patching_id: + { StubFrame f(sasm, "load_mirror_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_mirror_patching)); + } + break; + + case load_appendix_patching_id: + { StubFrame f(sasm, "load_appendix_patching", dont_gc_arguments); + // we should set up register map + oop_maps = generate_patching(sasm, CAST_FROM_FN_PTR(address, move_appendix_patching)); + } + break; + + case handle_exception_nofpu_id: + case handle_exception_id: + { StubFrame f(sasm, "handle_exception", dont_gc_arguments); + oop_maps = generate_handle_exception(id, sasm); + } + break; + + case handle_exception_from_callee_id: + { StubFrame f(sasm, "handle_exception_from_callee", dont_gc_arguments); + oop_maps = generate_handle_exception(id, sasm); + } + break; + + case throw_index_exception_id: + { StubFrame f(sasm, "index_range_check_failed", dont_gc_arguments); + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_index_exception), true); + } + break; + + case throw_array_store_exception_id: + { StubFrame f(sasm, "throw_array_store_exception", dont_gc_arguments); + // tos + 0: link + // + 1: return address + oop_maps = generate_exception_throw(sasm, CAST_FROM_FN_PTR(address, throw_array_store_exception), true); + } + break; + + case predicate_failed_trap_id: + { + StubFrame f(sasm, "predicate_failed_trap", dont_gc_arguments); + + OopMap* map = save_live_registers(sasm); + + int call_offset = __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, predicate_failed_trap)); + oop_maps = new OopMapSet(); + oop_maps->add_gc_map(call_offset, map); + restore_live_registers(sasm); + __ leave(); + DeoptimizationBlob* deopt_blob = SharedRuntime::deopt_blob(); + assert(deopt_blob != NULL, "deoptimization blob must have been created"); + + __ far_jump(RuntimeAddress(deopt_blob->unpack_with_reexecution())); + } + break; + + + default: + { StubFrame f(sasm, "unimplemented entry", dont_gc_arguments); + __ mov(r0, (int)id); + __ call_RT(noreg, noreg, CAST_FROM_FN_PTR(address, unimplemented_entry), r0); + __ should_not_reach_here(); + } + break; + } + } + return oop_maps; +} + +#undef __ + +const char *Runtime1::pd_name_for_address(address entry) { +#ifdef __SOFTFP__ +#define FUNCTION_CASE(a, f) \ + if ((intptr_t)a == CAST_FROM_FN_PTR(intptr_t, f)) return #f + + FUNCTION_CASE(entry, SharedRuntime::i2f); + FUNCTION_CASE(entry, SharedRuntime::i2d); + FUNCTION_CASE(entry, SharedRuntime::f2d); + 
FUNCTION_CASE(entry, SharedRuntime::fcmpg); + FUNCTION_CASE(entry, SharedRuntime::fcmpl); + FUNCTION_CASE(entry, SharedRuntime::dcmpg); + FUNCTION_CASE(entry, SharedRuntime::dcmpl); + FUNCTION_CASE(entry, SharedRuntime::unordered_fcmple); + FUNCTION_CASE(entry, SharedRuntime::unordered_dcmple); +#undef FUNCTION_CASE +#endif + + return "Unknown_Func_Ptr"; +} --- /dev/null 2018-09-25 19:24:33.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c1_globals_aarch32.hpp 2018-09-25 19:24:32.000000000 +0300 @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_C1_GLOBALS_AARCH32_HPP +#define CPU_AARCH32_VM_C1_GLOBALS_AARCH32_HPP + +#include "utilities/globalDefinitions.hpp" +#include "utilities/macros.hpp" + +// Sets the default values for platform dependent flags used by the client compiler. 
+// (see c1_globals.hpp) + +#ifndef TIERED +define_pd_global(bool, BackgroundCompilation, true ); +define_pd_global(bool, UseTLAB, true ); +define_pd_global(bool, ResizeTLAB, true ); +define_pd_global(bool, InlineIntrinsics, true ); +define_pd_global(bool, PreferInterpreterNativeStubs, false); +define_pd_global(bool, ProfileTraps, false); +define_pd_global(bool, UseOnStackReplacement, true); +define_pd_global(bool, TieredCompilation, false); +define_pd_global(intx, CompileThreshold, 1500 ); + +define_pd_global(intx, OnStackReplacePercentage, 933 ); +define_pd_global(intx, FreqInlineSize, 325 ); +define_pd_global(intx, NewSizeThreadIncrease, 4*K ); +define_pd_global(intx, InitialCodeCacheSize, 160*K); +define_pd_global(intx, ReservedCodeCacheSize, 32*M ); +define_pd_global(intx, NonProfiledCodeHeapSize, 13*M ); +define_pd_global(intx, ProfiledCodeHeapSize, 14*M ); +define_pd_global(intx, NonNMethodCodeHeapSize, 5*M ); +define_pd_global(bool, ProfileInterpreter, false); +define_pd_global(intx, CodeCacheExpansionSize, 32*K ); +define_pd_global(uintx, CodeCacheMinBlockLength, 1); +define_pd_global(uintx, CodeCacheMinimumUseSpace, 400*K); +define_pd_global(uintx, MetaspaceSize, 12*M ); +define_pd_global(bool, NeverActAsServerClassMachine, true ); +define_pd_global(uint64_t,MaxRAM, 1ULL*G); +define_pd_global(bool, CICompileOSR, true ); +#endif // !TIERED +define_pd_global(bool, UseTypeProfile, false); +define_pd_global(bool, RoundFPResults, true ); + +define_pd_global(bool, LIRFillDelaySlots, false); +define_pd_global(bool, OptimizeSinglePrecision, true ); +define_pd_global(bool, CSEArrayLength, true ); +define_pd_global(bool, TwoOperandLIRForm, false); + +#endif // CPU_AARCH32_VM_C1_GLOBALS_AARCH32_HPP --- /dev/null 2018-09-25 19:24:34.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/c2_globals_aarch32.hpp 2018-09-25 19:24:33.000000000 +0300 @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2000, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_C2_GLOBALS_AARCH32_HPP +#define CPU_AARCH32_VM_C2_GLOBALS_AARCH32_HPP + +#include "utilities/globalDefinitions.hpp" +#include "utilities/macros.hpp" + +// +// Sets the default values for platform dependent flags used by the server compiler. +// (see c2_globals.hpp). Alpha-sorted. 
+ +define_pd_global(bool, BackgroundCompilation, true); +define_pd_global(bool, CICompileOSR, true); +define_pd_global(bool, InlineIntrinsics, false); // TODO FIXME temporary, please enable +define_pd_global(bool, PreferInterpreterNativeStubs, false); +define_pd_global(bool, ProfileTraps, true); +define_pd_global(bool, UseOnStackReplacement, true); +define_pd_global(bool, ProfileInterpreter, true); +define_pd_global(bool, TieredCompilation, false); +define_pd_global(intx, CompileThreshold, 10000); + +define_pd_global(intx, OnStackReplacePercentage, 140); +define_pd_global(intx, ConditionalMoveLimit, 4); +// C2 gets to use all the float/double registers +define_pd_global(intx, FLOATPRESSURE, 30); +define_pd_global(intx, FreqInlineSize, 175); +define_pd_global(intx, INTPRESSURE, 12); +define_pd_global(intx, InteriorEntryAlignment, 32); // = CodeEntryAlignment +define_pd_global(size_t, NewSizeThreadIncrease, ScaleForWordSize(4*K)); +// The default setting 16/16 seems to work best. +// (For _228_jack 16/16 is 2% better than 4/4, 16/4, 32/32, 32/16, or 16/32.) +//define_pd_global(intx, OptoLoopAlignment, 16); // = 4*wordSize +define_pd_global(intx, RegisterCostAreaRatio, 16000); +define_pd_global(bool, UseTLAB, true); +define_pd_global(bool, ResizeTLAB, true); +define_pd_global(intx, LoopUnrollLimit, 60); +define_pd_global(intx, LoopPercentProfileLimit, 10); +define_pd_global(intx, MinJumpTableSize, 16); + +// Peephole and CISC spilling both break the graph, and so makes the +// scheduler sick. +define_pd_global(bool, OptoPeephole, false); +define_pd_global(bool, UseCISCSpill, false); +define_pd_global(bool, OptoBundling, false); +define_pd_global(bool, OptoScheduling, true); +define_pd_global(bool, OptoRegScheduling, false); +define_pd_global(bool, SuperWordLoopUnrollAnalysis, false); +define_pd_global(bool, IdealizeClearArrayNode, true); + +// InitialCodeCacheSize derived from specjbb2000 run. +define_pd_global(size_t, InitialCodeCacheSize, 1536*K); // Integral multiple of CodeCacheExpansionSize +define_pd_global(size_t, ReservedCodeCacheSize, 32*M); +define_pd_global(size_t, NonProfiledCodeHeapSize, 13*M); +define_pd_global(size_t, ProfiledCodeHeapSize, 14*M); +define_pd_global(size_t, NonNMethodCodeHeapSize, 5*M ); +define_pd_global(size_t, CodeCacheExpansionSize, 32*K); +// Ergonomics related flags +define_pd_global(uint64_t, MaxRAM, 4ULL*G); +define_pd_global(uintx, CodeCacheMinBlockLength, 4); +define_pd_global(size_t, CodeCacheMinimumUseSpace, 400*K); + +define_pd_global(bool, TrapBasedRangeChecks, false); // Not needed + +// Heap related flags +define_pd_global(size_t, MetaspaceSize, ScaleForWordSize(16*M)); + +// Ergonomics related flags +define_pd_global(bool, NeverActAsServerClassMachine, false); + +#endif // CPU_AARCH32_VM_C2_GLOBALS_AARCH32_HPP --- /dev/null 2018-09-25 19:24:35.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/codeBuffer_aarch32.hpp 2018-09-25 19:24:34.000000000 +0300 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_CODEBUFFER_AARCH32_HPP +#define CPU_AARCH32_VM_CODEBUFFER_AARCH32_HPP + +private: + void pd_initialize() {} + +public: + void flush_bundle(bool start_new_bundle) {} + +#endif // CPU_AARCH32_VM_CODEBUFFER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:36.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/compiledIC_aarch32.cpp 2018-09-25 19:24:35.000000000 +0300 @@ -0,0 +1,148 @@ +/* + * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "code/compiledIC.hpp" +#include "code/icBuffer.hpp" +#include "code/nmethod.hpp" +#include "memory/resourceArea.hpp" +#include "runtime/mutexLocker.hpp" +#include "runtime/safepoint.hpp" + +// ---------------------------------------------------------------------------- + +#define __ _masm. +address CompiledStaticCall::emit_to_interp_stub(CodeBuffer &cbuf, address mark) { + // Stub is fixed up when the corresponding call is converted from + // calling compiled code to calling interpreted code. + // mov rmethod, 0 + // jmp -4 # to self + + if (mark == NULL) { + mark = cbuf.insts_mark(); // Get mark within main instrs section. + } + + // Note that the code buffer's insts_mark is always relative to insts. + // That's why we must use the macroassembler to generate a stub. 
+ MacroAssembler _masm(&cbuf); + + address base = __ start_a_stub(to_interp_stub_size()); + + int offset = __ offset(); + if (base == NULL) { + return NULL; // CodeBuffer::expand failed + } + // static stub relocation stores the instruction address of the call + __ relocate(static_stub_Relocation::spec(mark)); + // static stub relocation also tags the Method* in the code-stream. + __ mov_metadata(rmethod, (Metadata*)NULL); + __ movptr(rscratch1, 0); + __ b(rscratch1); + + assert((__ offset() - offset) <= (int)to_interp_stub_size(), "stub too big"); + __ end_a_stub(); + return base; +} +#undef __ + +int CompiledStaticCall::to_interp_stub_size() { + return 7 * NativeInstruction::arm_insn_sz; +} + +int CompiledStaticCall::to_trampoline_stub_size() { + // AArch32 doesn't use trampoline stubs. + return 0; +} + +// Relocation entries for call stub, compiled java to interpreter. +int CompiledStaticCall::reloc_to_interp_stub() { + return 4; // 3 in emit_to_interp_stub + 1 in emit_call +} + +void CompiledDirectStaticCall::set_to_interpreted(const methodHandle& callee, address entry) { + address stub = find_stub(false /* is_aot */); + guarantee(stub != NULL, "stub not found"); + + if (TraceICs) { + ResourceMark rm; + tty->print_cr("CompiledDirectStaticCall@" INTPTR_FORMAT ": set_to_interpreted %s", + p2i(instruction_address()), + callee->name_and_sig_as_C_string()); + } + + // Creation also verifies the object. + NativeMovConstReg* method_holder = nativeMovConstReg_at(stub); + NativeJump* jump = NativeJump::from(method_holder->next_instruction_address()); +#ifndef PRODUCT + // read the value once + volatile intptr_t data = method_holder->data(); + assert(data == 0 || data == (intptr_t)callee(), + "a) MT-unsafe modification of inline cache"); + assert(data == 0 || jump->jump_destination() == entry, + "b) MT-unsafe modification of inline cache"); +#endif + // Update stub. + method_holder->set_data((intptr_t)callee()); + jump->set_jump_destination(entry); + ICache::invalidate_range(stub, to_interp_stub_size()); + // Update jump to call. + set_destination_mt_safe(stub); +} + +void CompiledDirectStaticCall::set_stub_to_clean(static_stub_Relocation* static_stub) { + assert (CompiledIC_lock->is_locked() || SafepointSynchronize::is_at_safepoint(), "mt unsafe call"); + // Reset stub. + address stub = static_stub->addr(); + assert(stub != NULL, "stub not found"); + // Creation also verifies the object. + NativeMovConstReg* method_holder = nativeMovConstReg_at(stub); + method_holder->set_data(0); +} + +//----------------------------------------------------------------------------- +// Non-product mode code +#ifndef PRODUCT + +void CompiledDirectStaticCall::verify() { + // Verify call. + _call->verify(); + if (os::is_MP()) { + _call->verify_alignment(); + } + + // Verify stub. + address stub = find_stub(false /* is_aot */); + assert(stub != NULL, "no stub found for static call"); + // Creation also verifies the object. + NativeMovConstReg* method_holder = nativeMovConstReg_at(stub); + NativeJump* jump = nativeJump_at(method_holder->next_instruction_address()); + + // Verify state. + assert(is_clean() || is_call_to_compiled() || is_call_to_interpreted(), "sanity check"); +} + +#endif // !PRODUCT --- /dev/null 2018-09-25 19:24:37.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/copy_aarch32.hpp 2018-09-25 19:24:37.000000000 +0300 @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. 
+ * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_COPY_AARCH32_HPP +#define CPU_AARCH32_VM_COPY_AARCH32_HPP + +// Inline functions for memory copy and fill. + +// Contains inline asm implementations +#include OS_CPU_HEADER_INLINE(copy) + + +static void pd_fill_to_words(HeapWord* tohw, size_t count, juint value) { +/* julong* to = (julong*) tohw; + julong v = ((julong) value << 32) | value; + while (count-- > 0) { + *to++ = v; + }*/ + juint *to = (juint*)tohw; + while(count-- > 0) { + *to++ = value; + } +} + +static void pd_fill_to_aligned_words(HeapWord* tohw, size_t count, juint value) { + pd_fill_to_words(tohw, count, value); +} + +static void pd_fill_to_bytes(void* to, size_t count, jubyte value) { + (void)memset(to, value, count); +} + +static void pd_zero_to_words(HeapWord* tohw, size_t count) { + pd_fill_to_words(tohw, count, 0); +} + +static void pd_zero_to_bytes(void* to, size_t count) { + (void)memset(to, 0, count); +} + +#endif // CPU_AARCH32_VM_COPY_AARCH32_HPP --- /dev/null 2018-09-25 19:24:38.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/cpustate_aarch32.hpp 2018-09-25 19:24:38.000000000 +0300 @@ -0,0 +1,596 @@ +/* + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef _CPU_STATE_H +#define _CPU_STATE_H + +#include + +/* + * symbolic names used to identify general registers which also match + * the registers indices in machine code + * + * We have 32 general registers which can be read/written as 32 bit or + * 64 bit sources/sinks and are appropriately referred to as Wn or Xn + * in the assembly code. Some instructions mix these access modes + * (e.g. ADD X0, X1, W2) so the implementation of the instruction + * needs to *know* which type of read or write access is required. + */ +enum GReg { + R0, + R1, + R2, + R3, + R4, + R5, + R6, + R7, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, + R16, + R17, + R18, + R19, + R20, + R21, + R22, + R23, + R24, + R25, + R26, + R27, + R28, + R29, + R30, + R31, + // and now the aliases + RSCRATCH1=R8, + RSCRATCH2=R9, + RMETHOD=R12, + RESP=R20, + RDISPATCH=R21, + RBCP=R22, + RLOCALS=R24, + RMONITORS=R25, + RCPOOL=R26, + RHEAPBASE=R27, + RTHREAD=R28, + FP = R29, + LR = R30, + SP = R31, + ZR = R31 +}; + +/* + * symbolic names used to refer to floating point registers which also + * match the registers indices in machine code + * + * We have 32 FP registers which can be read/written as 8, 16, 32, 64 + * and 128 bit sources/sinks and are appropriately referred to as Bn, + * Hn, Sn, Dn and Qn in the assembly code. Some instructions mix these + * access modes (e.g. FCVT S0, D0) so the implementation of the + * instruction needs to *know* which type of read or write access is + * required. + */ + +enum VReg { + V0, + V1, + V2, + V3, + V4, + V5, + V6, + V7, + V8, + V9, + V10, + V11, + V12, + V13, + V14, + V15, + V16, + V17, + V18, + V19, + V20, + V21, + V22, + V23, + V24, + V25, + V26, + V27, + V28, + V29, + V30, + V31, +}; + +/** + * all the different integer bit patterns for the components of a + * general register are overlaid here using a union so as to allow all + * reading and writing of the desired bits. + * + * n.b. the ARM spec says that when you write a 32 bit register you + * are supposed to write the low 32 bits and zero the high 32 + * bits. But we don't actually have to care about this because Java + * will only ever consume the 32 bits value as a 64 bit quantity after + * an explicit extend. + */ +union GRegisterValue +{ + int8_t s8; + int16_t s16; + int32_t s32; + int64_t s64; + u_int8_t u8; + u_int16_t u16; + u_int32_t u32; + u_int64_t u64; +}; + +class GRegister +{ +public: + GRegisterValue value; +}; + +/* + * float registers provide for storage of a single, double or quad + * word format float in the same register. single floats are not + * paired within each double register as per 32 bit arm. instead each + * 128 bit register Vn embeds the bits for Sn, and Dn in the lower + * quarter and half, respectively, of the bits for Qn. + * + * The upper bits can also be accessed as single or double floats by + * the float vector operations using indexing e.g. V1.D[1], V1.S[3] + * etc and, for SIMD operations using a horrible index range notation. + * + * The spec also talks about accessing float registers as half words + * and bytes with Hn and Bn providing access to the low 16 and 8 bits + * of Vn but it is not really clear what these bits represent. We can + * probably ignore this for Java anyway. However, we do need to access + * the raw bits at 32 and 64 bit resolution to load to/from integer + * registers. 
+ */ + +union FRegisterValue +{ + float s; + double d; + long double q; + // eventually we will need to be able to access the data as a vector + // the integral array elements allow us to access the bits in s, d, + // q, vs and vd at an appropriate level of granularity + u_int8_t vb[16]; + u_int16_t vh[8]; + u_int32_t vw[4]; + u_int64_t vx[2]; + float vs[4]; + double vd[2]; +}; + +class FRegister +{ +public: + FRegisterValue value; +}; + +/* + * CPSR register -- this does not exist as a directly accessible + * register but we need to store the flags so we can implement + * flag-seting and flag testing operations + * + * we can possibly use injected x86 asm to report the outcome of flag + * setting operations. if so we will need to grab the flags + * immediately after the operation in order to ensure we don't lose + * them because of the actions of the simulator. so we still need + * somewhere to store the condition codes. + */ + +class CPSRRegister +{ +public: + u_int32_t value; + +/* + * condition register bit select values + * + * the order of bits here is important because some of + * the flag setting conditional instructions employ a + * bit field to populate the flags when a false condition + * bypasses execution of the operation and we want to + * be able to assign the flags register using the + * supplied value. + */ + + enum CPSRIdx { + V_IDX, + C_IDX, + Z_IDX, + N_IDX + }; + + enum CPSRMask { + V = 1 << V_IDX, + C = 1 << C_IDX, + Z = 1 << Z_IDX, + N = 1 << N_IDX + }; + + static const int CPSR_ALL_FLAGS = (V | C | Z | N); +}; + +// auxiliary function to assemble the relevant bits from +// the x86 EFLAGS register into an ARM CPSR value + +#define X86_V_IDX 11 +#define X86_C_IDX 0 +#define X86_Z_IDX 6 +#define X86_N_IDX 7 + +#define X86_V (1 << X86_V_IDX) +#define X86_C (1 << X86_C_IDX) +#define X86_Z (1 << X86_Z_IDX) +#define X86_N (1 << X86_N_IDX) + +inline u_int32_t convertX86Flags(u_int32_t x86flags) +{ + u_int32_t flags; + // set N flag + flags = ((x86flags & X86_N) >> X86_N_IDX); + // shift then or in Z flag + flags <<= 1; + flags |= ((x86flags & X86_Z) >> X86_Z_IDX); + // shift then or in C flag + flags <<= 1; + flags |= ((x86flags & X86_C) >> X86_C_IDX); + // shift then or in V flag + flags <<= 1; + flags |= ((x86flags & X86_V) >> X86_V_IDX); + + return flags; +} + +inline u_int32_t convertX86FlagsFP(u_int32_t x86flags) +{ + // x86 flags set by fcomi(x,y) are ZF:PF:CF + // (yes, that's PF for parity, WTF?) + // where + // 0) 0:0:0 means x > y + // 1) 0:0:1 means x < y + // 2) 1:0:0 means x = y + // 3) 1:1:1 means x and y are unordered + // note that we don't have to check PF so + // we really have a simple 2-bit case switch + // the corresponding ARM64 flags settings + // in hi->lo bit order are + // 0) --C- + // 1) N--- + // 2) -ZC- + // 3) --CV + + static u_int32_t armFlags[] = { + 0b0010, + 0b1000, + 0b0110, + 0b0011 + }; + // pick out the ZF and CF bits + u_int32_t zc = ((x86flags & X86_Z) >> X86_Z_IDX); + zc <<= 1; + zc |= ((x86flags & X86_C) >> X86_C_IDX); + + return armFlags[zc]; +} + +/* + * FPSR register -- floating point status register + + * this register includes IDC, IXC, UFC, OFC, DZC, IOC and QC bits, + * and the floating point N, Z, C, V bits but the latter are unused in + * aarch32 mode. the sim ignores QC for now. 
+ * + * bit positions are as per the ARMv7 FPSCR register + * + * IDC : 7 ==> Input Denormal (cumulative exception bit) + * IXC : 4 ==> Inexact + * UFC : 3 ==> Underflow + * OFC : 2 ==> Overflow + * DZC : 1 ==> Division by Zero + * IOC : 0 ==> Invalid Operation + */ + +class FPSRRegister +{ +public: + u_int32_t value; + // indices for bits in the FPSR register value + enum FPSRIdx { + IO_IDX = 0, + DZ_IDX = 1, + OF_IDX = 2, + UF_IDX = 3, + IX_IDX = 4, + ID_IDX = 7 + }; + // corresponding bits as numeric values + enum FPSRMask { + IO = (1 << IO_IDX), + DZ = (1 << DZ_IDX), + OF = (1 << OF_IDX), + UF = (1 << UF_IDX), + IX = (1 << IX_IDX), + ID = (1 << ID_IDX) + }; + static const int FPSR_ALL_FPSRS = (IO | DZ | OF | UF | IX | ID); +}; + +// debugger support + +enum PrintFormat +{ + FMT_DECIMAL, + FMT_HEX, + FMT_SINGLE, + FMT_DOUBLE, + FMT_QUAD, + FMT_MULTI +}; + +/* + * model of the registers and other state associated with the cpu + */ +class CPUState +{ + friend class AArch64Simulator; +private: + // this is the PC of the instruction being executed + u_int64_t pc; + // this is the PC of the instruction to be executed next + // it is defaulted to pc + 4 at instruction decode but + // execute may reset it + + u_int64_t nextpc; + GRegister gr[33]; // extra register at index 32 is used + // to hold zero value + FRegister fr[32]; + CPSRRegister cpsr; + FPSRRegister fpsr; + +public: + + CPUState() { + gr[20].value.u64 = 0; // establish initial condition for + // checkAssertions() + trace_counter = 0; + } + + // General Register access macros + + // only xreg or xregs can be used as an lvalue in order to update a + // register. this ensures that the top part of a register is always + // assigned when it is written by the sim. + + inline u_int64_t &xreg(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.u64; + } else { + return gr[reg].value.u64; + } + } + + inline int64_t &xregs(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.s64; + } else { + return gr[reg].value.s64; + } + } + + inline u_int32_t wreg(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.u32; + } else { + return gr[reg].value.u32; + } + } + + inline int32_t wregs(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.s32; + } else { + return gr[reg].value.s32; + } + } + + inline u_int32_t hreg(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.u16; + } else { + return gr[reg].value.u16; + } + } + + inline int32_t hregs(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.s16; + } else { + return gr[reg].value.s16; + } + } + + inline u_int32_t breg(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.u8; + } else { + return gr[reg].value.u8; + } + } + + inline int32_t bregs(GReg reg, int r31_is_sp) { + if (reg == R31 && !r31_is_sp) { + return gr[32].value.s8; + } else { + return gr[reg].value.s8; + } + } + + // FP Register access macros + + // all non-vector accessors return a reference so we can both read + // and assign + + inline float &sreg(VReg reg) { + return fr[reg].value.s; + } + + inline double &dreg(VReg reg) { + return fr[reg].value.d; + } + + inline long double &qreg(VReg reg) { + return fr[reg].value.q; + } + + // all vector register accessors return a pointer + + inline float *vsreg(VReg reg) { + return &fr[reg].value.vs[0]; + } + + inline double *vdreg(VReg reg) { + return &fr[reg].value.vd[0]; + } + + 
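// Illustrative sketch (standalone; not part of this patch): the xreg()/wreg()
// accessors above redirect reads of R31 to the extra slot at index 32, which
// the simulator keeps at zero, unless the caller asks for the stack-pointer
// view (r31_is_sp). The names below (SketchRegFile, etc.) are invented for the
// illustration; this is only a minimal model of that selection logic.

#include <cassert>
#include <cstdint>

struct SketchRegFile {
  uint64_t gr[33];            // slot 32 plays the role of the hard-wired zero
  SketchRegFile() : gr() {}   // zero-initialize the sketch's register file

  uint64_t& xreg(int reg, bool r31_is_sp) {
    // same selection as CPUState::xreg(): R31 is the SP view only on request
    return (reg == 31 && !r31_is_sp) ? gr[32] : gr[reg];
  }
};

int main() {
  SketchRegFile rf;
  rf.xreg(31, /*r31_is_sp=*/true) = 0x7ffc0010u;  // write through the SP view
  assert(rf.xreg(31, true)  == 0x7ffc0010u);      // SP view reads it back
  assert(rf.xreg(31, false) == 0);                // zero-register view stays 0
  return 0;
}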
inline u_int8_t *vbreg(VReg reg) { + return &fr[reg].value.vb[0]; + } + + inline u_int16_t *vhreg(VReg reg) { + return &fr[reg].value.vh[0]; + } + + inline u_int32_t *vwreg(VReg reg) { + return &fr[reg].value.vw[0]; + } + + inline u_int64_t *vxreg(VReg reg) { + return &fr[reg].value.vx[0]; + } + + union GRegisterValue prev_sp, prev_fp; + + static const int trace_size = 256; + u_int64_t trace_buffer[trace_size]; + int trace_counter; + + bool checkAssertions() + { + // Make sure that SP is 16-aligned + // Also make sure that ESP is above SP. + // We don't care about checking ESP if it is null, i.e. it hasn't + // been used yet. + if (gr[31].value.u64 & 0x0f) { + asm volatile("nop"); + return false; + } + return true; + } + + // pc register accessors + + // this instruction can be used to fetch the current PC + u_int64_t getPC(); + // instead of setting the current PC directly you can + // first set the next PC (either absolute or PC-relative) + // and later copy the next PC into the current PC + // this supports a default increment by 4 at instruction + // fetch with an optional reset by control instructions + u_int64_t getNextPC(); + void setNextPC(u_int64_t next); + void offsetNextPC(int64_t offset); + // install nextpc as current pc + void updatePC(); + + // this instruction can be used to save the next PC to LR + // just before installing a branch PC + inline void saveLR() { gr[LR].value.u64 = nextpc; } + + // cpsr register accessors + u_int32_t getCPSRRegister(); + void setCPSRRegister(u_int32_t flags); + // read a specific subset of the flags as a bit pattern + // mask should be composed using elements of enum FlagMask + u_int32_t getCPSRBits(u_int32_t mask); + // assign a specific subset of the flags as a bit pattern + // mask and value should be composed using elements of enum FlagMask + void setCPSRBits(u_int32_t mask, u_int32_t value); + // test the value of a single flag returned as 1 or 0 + u_int32_t testCPSR(CPSRRegister::CPSRIdx idx); + // set a single flag + void setCPSR(CPSRRegister::CPSRIdx idx); + // clear a single flag + void clearCPSR(CPSRRegister::CPSRIdx idx); + // utility method to set ARM CSPR flags from an x86 bit mask generated by integer arithmetic + void setCPSRRegisterFromX86(u_int64_t x86Flags); + // utility method to set ARM CSPR flags from an x86 bit mask generated by floating compare + void setCPSRRegisterFromX86FP(u_int64_t x86Flags); + + // fpsr register accessors + u_int32_t getFPSRRegister(); + void setFPSRRegister(u_int32_t flags); + // read a specific subset of the fprs bits as a bit pattern + // mask should be composed using elements of enum FPSRRegister::FlagMask + u_int32_t getFPSRBits(u_int32_t mask); + // assign a specific subset of the flags as a bit pattern + // mask and value should be composed using elements of enum FPSRRegister::FlagMask + void setFPSRBits(u_int32_t mask, u_int32_t value); + // test the value of a single flag returned as 1 or 0 + u_int32_t testFPSR(FPSRRegister::FPSRIdx idx); + // set a single flag + void setFPSR(FPSRRegister::FPSRIdx idx); + // clear a single flag + void clearFPSR(FPSRRegister::FPSRIdx idx); + + // debugger support + void printPC(int pending, const char *trailing = "\n"); + void printInstr(u_int32_t instr, void (*dasm)(u_int64_t), const char *trailing = "\n"); + void printGReg(GReg reg, PrintFormat format = FMT_HEX, const char *trailing = "\n"); + void printVReg(VReg reg, PrintFormat format = FMT_HEX, const char *trailing = "\n"); + void printCPSR(const char *trailing = "\n"); + void printFPSR(const char 
*trailing = "\n"); + void dumpState(); +}; + +#endif // ifndef _CPU_STATE_H --- /dev/null 2018-09-25 19:24:39.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/debug_aarch32.cpp 2018-09-25 19:24:39.000000000 +0300 @@ -0,0 +1,35 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "code/codeCache.hpp" +#include "code/nmethod.hpp" +#include "runtime/frame.hpp" +#include "runtime/init.hpp" +#include "runtime/os.hpp" +#include "utilities/debug.hpp" + +void pd_ps(frame f) {} --- /dev/null 2018-09-25 19:24:40.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/decode_aarch32.hpp 2018-09-25 19:24:40.000000000 +0300 @@ -0,0 +1,413 @@ +/* + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef _DECODE_H +#define _DECODE_H + +#include +#include "cpustate_aarch32.hpp" + +// bitfield immediate expansion helper + +extern int expandLogicalImmediate(u_int32_t immN, u_int32_t immr, + u_int32_t imms, u_int64_t &bimm); + + +/* + * codes used in conditional instructions + * + * these are passed to conditional operations to identify which + * condition to test for + */ +enum CondCode { + EQ = 0b0000, // meaning Z == 1 + NE = 0b0001, // meaning Z == 0 + HS = 0b0010, // meaning C == 1 + CS = HS, + LO = 0b0011, // meaning C == 0 + CC = LO, + MI = 0b0100, // meaning N == 1 + PL = 0b0101, // meaning N == 0 + VS = 0b0110, // meaning V == 1 + VC = 0b0111, // meaning V == 0 + HI = 0b1000, // meaning C == 1 && Z == 0 + LS = 0b1001, // meaning !(C == 1 && Z == 0) + GE = 0b1010, // meaning N == V + LT = 0b1011, // meaning N != V + GT = 0b1100, // meaning Z == 0 && N == V + LE = 0b1101, // meaning !(Z == 0 && N == V) + AL = 0b1110, // meaning ANY + NV = 0b1111 // ditto +}; + +/* + * certain addressing modes for load require pre or post writeback of + * the computed address to a base register + */ +enum WriteBack { + Post = 0, + Pre = 1 +}; + +/* + * certain addressing modes for load require an offset to + * be optionally scaled so the decode needs to pass that + * through to the execute routine + */ +enum Scaling { + Unscaled = 0, + Scaled = 1 +}; + +/* + * when we do have to scale we do so by shifting using + * log(bytes in data element - 1) as the shift count. + * so we don't have to scale offsets when loading + * bytes. + */ +enum ScaleShift { + ScaleShift16 = 1, + ScaleShift32 = 2, + ScaleShift64 = 3, + ScaleShift128 = 4 +}; + +/* + * one of the addressing modes for load requires a 32-bit register + * value to be either zero- or sign-extended for these instructions + * UXTW or SXTW should be passed + * + * arithmetic register data processing operations can optionally + * extend a portion of the second register value for these + * instructions the value supplied must identify the portion of the + * register which is to be zero- or sign-exended + */ +enum Extension { + UXTB = 0, + UXTH = 1, + UXTW = 2, + UXTX = 3, + SXTB = 4, + SXTH = 5, + SXTW = 6, + SXTX = 7 +}; + +/* + * arithmetic and logical register data processing operations + * optionally perform a shift on the second register value + */ +enum Shift { + LSL = 0, + LSR = 1, + ASR = 2, + ROR = 3 +}; + +/* + * bit twiddling helpers for instruction decode + */ + +// 32 bit mask with bits [hi,...,lo] set + +static inline u_int32_t mask32(int hi = 31, int lo = 0) +{ + int nbits = (hi + 1) - lo; + return ((1 << nbits) - 1) << lo; +} + +static inline u_int64_t mask64(int hi = 63, int lo = 0) +{ + int nbits = (hi + 1) - lo; + return ((1L << nbits) - 1) << lo; +} + +// pick bits [hi,...,lo] from val +static inline u_int32_t pick32(u_int32_t val, int hi = 31, int lo = 0) +{ + return (val & mask32(hi, lo)); +} + +// pick bits [hi,...,lo] from val +static inline u_int64_t pick64(u_int64_t val, int hi = 31, int lo = 0) +{ + return (val & mask64(hi, lo)); +} + +// pick bits [hi,...,lo] from val and shift to [(hi-(newlo - lo)),newlo] +static inline u_int32_t pickshift32(u_int32_t val, int hi = 31, + int lo = 0, int newlo = 0) +{ + u_int32_t bits = pick32(val, hi, lo); + if (lo < newlo) { + return (bits << (newlo - lo)); + } else { + return (bits >> (lo - newlo)); + } +} +// mask [hi,lo] and shift down to start at bit 0 +static inline u_int32_t pickbits32(u_int32_t val, int hi = 31, int lo = 0) +{ + return (pick32(val, hi, lo) >> lo); +} 
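// Illustrative sketch (standalone; not part of this patch): how the mask/pick
// helpers above are typically used to decode fields of an instruction word.
// The mask/pick logic is repeated with <cstdint> types (sketch_* names are
// invented for the illustration) so the snippet compiles on its own; the field
// positions are those of the ARM A32 data-processing (immediate) encoding,
// where 0xE3A01005 is "mov r1, #5".

#include <cassert>
#include <cstdint>

static inline uint32_t sketch_mask32(int hi, int lo) {
  // bits [hi..lo] set, everything else clear (same shape as mask32() above)
  return ((1u << ((hi + 1) - lo)) - 1u) << lo;
}

static inline uint32_t sketch_pickbits32(uint32_t val, int hi, int lo) {
  // select bits [hi..lo] of val and shift the field down to bit 0
  return (val & sketch_mask32(hi, lo)) >> lo;
}

int main() {
  const uint32_t insn = 0xE3A01005u;             // mov r1, #5
  assert(sketch_pickbits32(insn, 31, 28) == 14); // cond field: 0b1110 (AL)
  assert(sketch_pickbits32(insn, 15, 12) == 1);  // Rd field: r1
  assert(sketch_pickbits32(insn, 11, 0)  == 5);  // imm12 field: 5
  return 0;
}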
+ +// mask [hi,lo] and shift down to start at bit 0 +static inline u_int64_t pickbits64(u_int64_t val, int hi = 63, int lo = 0) +{ + return (pick64(val, hi, lo) >> lo); +} + +/* + * decode registers, immediates and constants of various types + */ + +static inline GReg greg(u_int32_t val, int lo) +{ + return (GReg)pickbits32(val, lo + 4, lo); +} + +static inline VReg vreg(u_int32_t val, int lo) +{ + return (VReg)pickbits32(val, lo + 4, lo); +} + +static inline u_int32_t uimm(u_int32_t val, int hi, int lo) +{ + return pickbits32(val, hi, lo); +} + +static inline int32_t simm(u_int32_t val, int hi = 31, int lo = 0) { + union { + u_int32_t u; + int32_t n; + }; + + u = val << (31 - hi); + n = n >> (31 - hi + lo); + return n; +} + +static inline int64_t simm(u_int64_t val, int hi = 63, int lo = 0) { + union { + u_int64_t u; + int64_t n; + }; + + u = val << (63 - hi); + n = n >> (63 - hi + lo); + return n; +} + +static inline Shift shift(u_int32_t val, int lo) +{ + return (Shift)pickbits32(val, lo+1, lo); +} + +static inline Extension extension(u_int32_t val, int lo) +{ + return (Extension)pickbits32(val, lo+2, lo); +} + +static inline Scaling scaling(u_int32_t val, int lo) +{ + return (Scaling)pickbits32(val, lo, lo); +} + +static inline WriteBack writeback(u_int32_t val, int lo) +{ + return (WriteBack)pickbits32(val, lo, lo); +} + +static inline CondCode condcode(u_int32_t val, int lo) +{ + return (CondCode)pickbits32(val, lo+3, lo); +} + +/* + * operation decode + */ +// bits [28,25] are the primary dispatch vector + +static inline u_int32_t dispatchGroup(u_int32_t val) +{ + return pickshift32(val, 28, 25, 0); +} + +/* + * the 16 possible values for bits [28,25] identified by tags which + * map them to the 5 main instruction groups LDST, DPREG, ADVSIMD, + * BREXSYS and DPIMM. + * + * An extra group PSEUDO is included in one of the unallocated ranges + * for simulator-specific pseudo-instructions. + */ +enum DispatchGroup { + GROUP_PSEUDO_0000, + GROUP_UNALLOC_0001, + GROUP_UNALLOC_0010, + GROUP_UNALLOC_0011, + GROUP_LDST_0100, + GROUP_DPREG_0101, + GROUP_LDST_0110, + GROUP_ADVSIMD_0111, + GROUP_DPIMM_1000, + GROUP_DPIMM_1001, + GROUP_BREXSYS_1010, + GROUP_BREXSYS_1011, + GROUP_LDST_1100, + GROUP_DPREG_1101, + GROUP_LDST_1110, + GROUP_ADVSIMD_1111 +}; + +// bits [31, 29] of a Pseudo are the secondary dispatch vector + +static inline u_int32_t dispatchPseudo(u_int32_t val) +{ + return pickshift32(val, 31, 29, 0); +} + +/* + * the 8 possible values for bits [31,29] in a Pseudo Instruction. + * Bits [28,25] are always 0000. + */ + +enum DispatchPseudo { + PSEUDO_UNALLOC_000, // unallocated + PSEUDO_UNALLOC_001, // ditto + PSEUDO_UNALLOC_010, // ditto + PSEUDO_UNALLOC_011, // ditto + PSEUDO_UNALLOC_100, // ditto + PSEUDO_UNALLOC_101, // ditto + PSEUDO_CALLOUT_110, // CALLOUT -- bits [24,0] identify call/ret sig + PSEUDO_HALT_111 // HALT -- bits [24, 0] identify halt code +}; + +// bits [25, 23] of a DPImm are the secondary dispatch vector + +static inline u_int32_t dispatchDPImm(u_int32_t instr) +{ + return pickshift32(instr, 25, 23, 0); +} + +/* + * the 8 possible values for bits [25,23] in a Data Processing Immediate + * Instruction. Bits [28,25] are always 100_. 
+ */ + +enum DispatchDPImm { + DPIMM_PCADR_000, // PC-rel-addressing + DPIMM_PCADR_001, // ditto + DPIMM_ADDSUB_010, // Add/Subtract (immediate) + DPIMM_ADDSUB_011, // ditto + DPIMM_LOG_100, // Logical (immediate) + DPIMM_MOV_101, // Move Wide (immediate) + DPIMM_BITF_110, // Bitfield + DPIMM_EXTR_111 // Extract +}; + +// bits [29,28:26] of a LS are the secondary dispatch vector + +static inline u_int32_t dispatchLS(u_int32_t instr) +{ + return (pickshift32(instr, 29, 28, 1) | + pickshift32(instr, 26, 26, 0)); +} + +/* + * the 8 possible values for bits [29,28:26] in a Load/Store + * Instruction. Bits [28,25] are always _1_0 + */ + +enum DispatchLS { + LS_EXCL_000, // Load/store exclusive (includes some unallocated) + LS_ADVSIMD_001, // AdvSIMD load/store (various -- includes some unallocated) + LS_LIT_010, // Load register literal (includes some unallocated) + LS_LIT_011, // ditto + LS_PAIR_100, // Load/store register pair (various) + LS_PAIR_101, // ditto + LS_OTHER_110, // other load/store formats + LS_OTHER_111 // ditto +}; + +// bits [28:24:21] of a DPReg are the secondary dispatch vector + +static inline u_int32_t dispatchDPReg(u_int32_t instr) +{ + return (pickshift32(instr, 28, 28, 2) | + pickshift32(instr, 24, 24, 1) | + pickshift32(instr, 21, 21, 0)); +} + +/* + * the 8 possible values for bits [28:24:21] in a Data Processing + * Register Instruction. Bits [28,25] are always _101 + */ + +enum DispatchDPReg { + DPREG_LOG_000, // Logical (shifted register) + DPREG_LOG_001, // ditto + DPREG_ADDSHF_010, // Add/subtract (shifted register) + DPREG_ADDEXT_011, // Add/subtract (extended register) + DPREG_ADDCOND_100, // Add/subtract (with carry) AND + // Cond compare/select AND + // Data Processing (1/2 source) + DPREG_UNALLOC_101, // Unallocated + DPREG_3SRC_110, // Data Processing (3 source) + DPREG_3SRC_111 // Data Processing (3 source) +}; + +// bits [31,29] of a BrExSys are the secondary dispatch vector + +static inline u_int32_t dispatchBrExSys(u_int32_t instr) +{ + return pickbits32(instr, 31, 29); +} + +/* + * the 8 possible values for bits [31,29] in a Branch/Exception/System + * Instruction. Bits [28,25] are always 101_ + */ + +enum DispatchBr { + BR_IMM_000, // Unconditional branch (immediate) + BR_IMMCMP_001, // Compare & branch (immediate) AND + // Test & branch (immediate) + BR_IMMCOND_010, // Conditional branch (immediate) AND Unallocated + BR_UNALLOC_011, // Unallocated + BR_IMM_100, // Unconditional branch (immediate) + BR_IMMCMP_101, // Compare & branch (immediate) AND + // Test & branch (immediate) + BR_REG_110, // Unconditional branch (register) AND System AND + // Excn gen AND Unallocated + BR_UNALLOC_111 // Unallocated +}; + +/* + * TODO still need to provide secondary decode and dispatch for + * AdvSIMD Insructions with instr[28,25] = 0111 or 1111 + */ + +#endif // ifndef DECODE_H --- /dev/null 2018-09-25 19:24:41.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/depChecker_aarch32.cpp 2018-09-25 19:24:41.000000000 +0300 @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "compiler/disassembler.hpp" +#include "depChecker_aarch32.hpp" + +// Nothing to do on aarch32 --- /dev/null 2018-09-25 19:24:43.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/depChecker_aarch32.hpp 2018-09-25 19:24:42.000000000 +0300 @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_DEPCHECKER_AARCH32_HPP +#define CPU_AARCH32_VM_DEPCHECKER_AARCH32_HPP + +// Nothing to do on aarch32 + +#endif // CPU_AARCH32_VM_DEPCHECKER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:44.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/disassembler_aarch32.hpp 2018-09-25 19:24:43.000000000 +0300 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_DISASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_VM_DISASSEMBLER_AARCH32_HPP + + static int pd_instruction_alignment() { + return 1; + } + + static const char* pd_cpu_opts() { + return ""; + } + +#endif // CPU_AARCH32_VM_DISASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:45.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/frame_aarch32.cpp 2018-09-25 19:24:44.000000000 +0300 @@ -0,0 +1,820 @@ +/* + * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "interpreter/interpreter.hpp" +#include "memory/resourceArea.hpp" +#include "oops/markOop.hpp" +#include "oops/method.hpp" +#include "oops/oop.inline.hpp" +#include "prims/methodHandles.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/handles.inline.hpp" +#include "runtime/javaCalls.hpp" +#include "runtime/monitorChunk.hpp" +#include "runtime/os.hpp" +#include "runtime/signature.hpp" +#include "runtime/stubCodeGenerator.hpp" +#include "runtime/stubRoutines.hpp" +#include "vmreg_aarch32.inline.hpp" +#ifdef COMPILER1 +#include "c1/c1_Runtime1.hpp" +#include "runtime/vframeArray.hpp" +#endif + +#ifdef ASSERT +void RegisterMap::check_location_valid() { +} +#endif + + +// Profiling/safepoint support + +bool frame::safe_for_sender(JavaThread *thread) { + address sp = (address)_sp; + address fp = (address)_fp; + address unextended_sp = (address)_unextended_sp; + + // consider stack guards when trying to determine "safe" stack pointers + static size_t stack_guard_size = os::uses_stack_guard_pages() ? 
+ (JavaThread::stack_red_zone_size() + JavaThread::stack_yellow_zone_size()) : 0; + size_t usable_stack_size = thread->stack_size() - stack_guard_size; + + // sp must be within the usable part of the stack (not in guards) + bool sp_safe = (sp < thread->stack_base()) && + (sp >= thread->stack_base() - usable_stack_size); + + + if (!sp_safe) { + return false; + } + + // unextended sp must be within the stack and above or equal sp + bool unextended_sp_safe = (unextended_sp < thread->stack_base()) && + (unextended_sp >= sp); + + if (!unextended_sp_safe) { + return false; + } + + // We know sp/unextended_sp are safe only fp is questionable here + + // If the current frame is known to the code cache then we can attempt to + // to construct the sender and do some validation of it. This goes a long way + // toward eliminating issues when we get in frame construction code + + if (_cb != NULL ) { + // an fp must be within the stack and above (but not equal) sp + // second evaluation on fp+ is added to handle situation where fp is -1 + const bool fp_safe = (fp < thread->stack_base() && (fp > sp) && (((fp + (get_return_addr_offset() * sizeof(void*))) < thread->stack_base()))); + + + // First check if frame is complete and tester is reliable + // Unfortunately we can only check frame complete for runtime stubs and nmethod + // other generic buffer blobs are more problematic so we just assume they are + // ok. adapter blobs never have a frame complete and are never ok. + + if (!_cb->is_frame_complete_at(_pc)) { + if (_cb->is_nmethod() || _cb->is_adapter_blob() || _cb->is_runtime_stub()) { + return false; + } + } + + // Could just be some random pointer within the codeBlob + if (!_cb->code_contains(_pc)) { + return false; + } + + // Entry frame checks + if (is_entry_frame()) { + // an entry frame must have a valid fp. + return fp_safe && is_entry_frame_valid(thread); + } + + intptr_t* sender_sp = NULL; + intptr_t* sender_unextended_sp = NULL; + address sender_pc = NULL; + intptr_t* saved_fp = NULL; + + if (is_interpreted_frame()) { + // fp must be safe + if (!fp_safe) { + return false; + } + + sender_pc = (address) this->fp()[get_return_addr_offset()]; + // for interpreted frames, the value below is the sender "raw" sp, + // which can be different from the sender unextended sp (the sp seen + // by the sender) because of current frame local variables + sender_sp = (intptr_t*) addr_at(sender_sp_offset); + sender_unextended_sp = (intptr_t*) this->fp()[get_interpreter_frame_sender_sp_offset()]; + saved_fp = (intptr_t*) this->fp()[get_link_offset()]; + + } else { + // must be some sort of compiled/runtime frame + // fp does not have to be safe (although it could be check for c1?) + + // check for a valid frame_size, otherwise we are unlikely to get a valid sender_pc + if (_cb->frame_size() <= 0) { + return false; + } + + sender_sp = _unextended_sp + _cb->frame_size(); + // Is sender_sp safe? + if ((address)sender_sp >= thread->stack_base()) { + return false; + } + sender_unextended_sp = sender_sp; + sender_pc = (address) *(sender_sp - 1 + frame::get_return_addr_offset()); + // Note: frame::sender_sp_offset is only valid for compiled frame + saved_fp = (intptr_t*) *(sender_sp - 1 + frame::get_link_offset()); + } + + + // If the potential sender is the interpreter then we can do some more checking + if (Interpreter::contains(sender_pc)) { + + // fp is always saved in a recognizable place in any code we generate. However + // only if the sender is interpreted/call_stub (c1 too?) 
are we certain that the saved fp + // is really a frame pointer. + + bool saved_fp_safe = ((address)saved_fp < thread->stack_base()) && (saved_fp > sender_sp); + + if (!saved_fp_safe) { + return false; + } + + // construct the potential sender + + frame sender(sender_sp, sender_unextended_sp, saved_fp, sender_pc); + + return sender.is_interpreted_frame_valid(thread); + + } + + // We must always be able to find a recognizable pc + CodeBlob* sender_blob = CodeCache::find_blob_unsafe(sender_pc); + if (sender_pc == NULL || sender_blob == NULL) { + return false; + } + + // Could be a zombie method + if (sender_blob->is_zombie() || sender_blob->is_unloaded()) { + return false; + } + + // Could just be some random pointer within the codeBlob + if (!sender_blob->code_contains(sender_pc)) { + return false; + } + + // We should never be able to see an adapter if the current frame is something from code cache + if (sender_blob->is_adapter_blob()) { + return false; + } + + // Could be the call_stub + if (StubRoutines::returns_to_call_stub(sender_pc)) { + bool saved_fp_safe = ((address)saved_fp < thread->stack_base()) && (saved_fp > sender_sp); + + if (!saved_fp_safe) { + return false; + } + + // construct the potential sender + + frame sender(sender_sp, sender_unextended_sp, saved_fp, sender_pc); + + // Validate the JavaCallWrapper an entry frame must have + address jcw = (address)sender.entry_frame_call_wrapper(); + + bool jcw_safe = (jcw < thread->stack_base()) && (jcw > (address)sender.fp()); + + return jcw_safe; + } + + CompiledMethod* nm = sender_blob->as_compiled_method_or_null(); + if (nm != NULL) { + if (nm->is_deopt_mh_entry(sender_pc) || nm->is_deopt_entry(sender_pc) || + nm->method()->is_method_handle_intrinsic()) { + return false; + } + } + + // If the frame size is 0 something (or less) is bad because every nmethod has a non-zero frame size + // because the return address counts against the callee's frame. + + if (sender_blob->frame_size() <= 0) { + assert(!sender_blob->is_compiled(), "should count return address at least"); + return false; + } + + // We should never be able to see anything here except an nmethod. If something in the + // code cache (current frame) is called by an entity within the code cache that entity + // should not be anything but the call stub (already covered), the interpreter (already covered) + // or an nmethod. + + if (!sender_blob->is_compiled()) { + return false; + } + + // Could put some more validation for the potential non-interpreted sender + // frame we'd create by calling sender if I could think of any. Wait for next crash in forte... + + // One idea is seeing if the sender_pc we have is one that we'd expect to call to current cb + + // We've validated the potential sender that would be created + return true; + } + + // an fp must be within the stack and above (but not equal) sp + // second evaluation on fp+ is added to handle situation where fp is -1 + const bool fp_safe = (fp < thread->stack_base() && (fp > sp) && (((fp + (get_return_addr_offset(VMFrameAPCS) * sizeof(void*))) < thread->stack_base()))); + + // Must be native-compiled frame. Since sender will try and use fp to find + // linkages it must be safe + + if (!fp_safe) { + return false; + } + + // Will the pc we fetch be non-zero (which we'll find at the oldest frame) + + if ( (address) this->fp()[get_return_addr_offset(VMFrameAPCS)] == NULL) return false; + + + // could try and do some more potential verification of native frame if we could think of some... 
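
frame::safe_for_sender above repeats one pattern several times: a candidate sp or fp is trusted only if it lies inside the usable part of the thread stack and in the right order relative to the other pointers. The standalone sketch below isolates that bounds test; StackBounds is a hypothetical stand-in for the JavaThread accessors, and the code-cache and frame-layout checks of the real routine are omitted.

    #include <cstddef>
    #include <cstdint>

    // Hypothetical stand-in for the JavaThread stack accessors; the stack
    // grows down from 'base'.
    struct StackBounds {
      uintptr_t base;        // thread->stack_base()
      size_t    size;        // thread->stack_size()
      size_t    guard_size;  // red + yellow guard zones carved out of 'size'
    };

    // sp must lie in the usable part of the stack: below the base and
    // above the guard pages.
    static bool sp_is_safe(uintptr_t sp, const StackBounds& s) {
      uintptr_t usable_bottom = s.base - (s.size - s.guard_size);
      return sp < s.base && sp >= usable_bottom;
    }

    // fp must be inside the stack, strictly above sp, and leave room for
    // the saved return address below the stack base; the offset may be
    // negative for the APCS-style layout.
    static bool fp_is_safe(uintptr_t fp, uintptr_t sp, const StackBounds& s,
                           int return_addr_offset_words) {
      uintptr_t ret_slot =
          fp + (intptr_t)return_addr_offset_words * (intptr_t)sizeof(void*);
      return fp < s.base && fp > sp && ret_slot < s.base;
    }

    int main() {
      StackBounds s{0x80000000u, 1u << 20, 32u << 10};
      uintptr_t sp = s.base - 0x1000, fp = s.base - 0x800;
      return sp_is_safe(sp, s) && fp_is_safe(fp, sp, s, 0) ? 0 : 1;
    }
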
+ + return true; + +} + +void frame::patch_pc(Thread* thread, address pc) { + address* pc_addr = &(((address*) sp())[-1 + frame::get_return_addr_offset()]); + if (TracePcPatching) { + tty->print_cr("patch_pc at address " INTPTR_FORMAT " [" INTPTR_FORMAT " -> " INTPTR_FORMAT "]", + p2i(pc_addr), p2i(*pc_addr), p2i(pc)); + } + // Either the return address is the original one or we are going to + // patch in the same address that's already there. + assert(_pc == *pc_addr || pc == *pc_addr, "must be"); + *pc_addr = pc; + _cb = CodeCache::find_blob(pc); + address original_pc = CompiledMethod::get_deopt_original_pc(this); + if (original_pc != NULL) { + assert(original_pc == _pc, "expected original PC to be stored before patching"); + _deopt_state = is_deoptimized; + // leave _pc as is + } else { + _deopt_state = not_deoptimized; + _pc = pc; + } +} + +bool frame::is_interpreted_frame() const { + return Interpreter::contains(pc()); +} + +int frame::frame_size(RegisterMap* map) const { + frame sender = this->sender(map); + return sender.sp() - sp(); +} + +intptr_t* frame::entry_frame_argument_at(int offset) const { + // convert offset to index to deal with tsi + int index = (Interpreter::expr_offset_in_bytes(offset)/wordSize); + // Entry frame's arguments are always in relation to unextended_sp() + return &unextended_sp()[index]; +} + +// sender_sp +intptr_t* frame::interpreter_frame_sender_sp() const { + assert(is_interpreted_frame(), "interpreted frame expected"); + return (intptr_t*) at(get_interpreter_frame_sender_sp_offset()); +} + +void frame::set_interpreter_frame_sender_sp(intptr_t* sender_sp) { + assert(is_interpreted_frame(), "interpreted frame expected"); + ptr_at_put(get_interpreter_frame_sender_sp_offset(), (intptr_t) sender_sp); +} + + +// monitor elements + +BasicObjectLock* frame::interpreter_frame_monitor_begin() const { + return (BasicObjectLock*) addr_at(get_interpreter_frame_monitor_block_bottom_offset()); +} + +BasicObjectLock* frame::interpreter_frame_monitor_end() const { + BasicObjectLock* result = (BasicObjectLock*) *addr_at(get_interpreter_frame_monitor_block_top_offset()); + // make sure the pointer points inside the frame + assert(sp() <= (intptr_t*) result, "monitor end should be above the stack pointer"); + assert((intptr_t*) result < fp(), "monitor end should be strictly below the frame pointer"); + return result; +} + +void frame::interpreter_frame_set_monitor_end(BasicObjectLock* value) { + *((BasicObjectLock**)addr_at(get_interpreter_frame_monitor_block_top_offset())) = value; +} + +// Used by template based interpreter deoptimization +void frame::interpreter_frame_set_last_sp(intptr_t* sp) { + *((intptr_t**)addr_at(get_interpreter_frame_last_sp_offset())) = sp; +} + +frame frame::sender_for_entry_frame(RegisterMap* map) const { + assert(map != NULL, "map must be set"); + // Java frame called from C; skip all C frames and return top C + // frame of that chunk as the sender + JavaFrameAnchor* jfa = entry_frame_call_wrapper()->anchor(); + assert(!entry_frame_is_first(), "next Java fp must be non zero"); + assert(jfa->last_Java_sp() > sp(), "must be above this frame on stack"); + // Since we are walking the stack now this nested anchor is obviously walkable + // even if it wasn't when it was stacked. + if (!jfa->walkable()) { + // Capture _last_Java_pc (if needed) and mark anchor walkable. 
+ jfa->capture_last_Java_pc(); + } + map->clear(); + assert(map->include_argument_oops(), "should be set by clear"); + vmassert(jfa->last_Java_pc() != NULL, "not walkable"); + frame fr(jfa->last_Java_sp(), jfa->last_Java_fp(), jfa->last_Java_pc()); + return fr; +} + +//------------------------------------------------------------------------------ +// frame::verify_deopt_original_pc +// +// Verifies the calculated original PC of a deoptimization PC for the +// given unextended SP. +#ifdef ASSERT +void frame::verify_deopt_original_pc(CompiledMethod* nm, intptr_t* unextended_sp) { + frame fr; + + // This is ugly but it's better than to change {get,set}_original_pc + // to take an SP value as argument. And it's only a debugging + // method anyway. + fr._unextended_sp = unextended_sp; + + address original_pc = nm->get_original_pc(&fr); + assert(nm->insts_contains_inclusive(original_pc), + "original PC must be in the main code section of the the compiled method (or must be immediately following it)"); +} +#endif + +//------------------------------------------------------------------------------ +// frame::adjust_unextended_sp +void frame::adjust_unextended_sp() { + // On aarch32, sites calling method handle intrinsics and lambda forms are treated + // as any other call site. Therefore, no special action is needed when we are + // returning to any of these call sites. + + if (_cb != NULL) { + CompiledMethod* sender_cm = _cb->as_compiled_method_or_null(); + if (sender_cm != NULL) { + // If the sender PC is a deoptimization point, get the original PC. + if (sender_cm->is_deopt_entry(_pc) || + sender_cm->is_deopt_mh_entry(_pc)) { + DEBUG_ONLY(verify_deopt_original_pc(sender_cm, _unextended_sp)); + } + } + } +} + +//------------------------------------------------------------------------------ +// frame::update_map_with_saved_link +void frame::update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr) { + // The interpreter and compiler(s) always save fp in a known + // location on entry. We must record where that location is + // so that if fp was live on callout from c2 we can find + // the saved copy no matter what it called. + + // Since the interpreter always saves fp if we record where it is then + // we don't have to always save fp on entry and exit to c2 compiled + // code, on entry will be enough. + map->set_location(rfp->as_VMReg(), (address) link_addr); +} + + +//------------------------------------------------------------------------------ +// frame::sender_for_interpreter_frame +frame frame::sender_for_interpreter_frame(RegisterMap* map) const { + // SP is the raw SP from the sender after adapter or interpreter + // extension. + intptr_t* sender_sp = this->sender_sp(); + + // This is the sp before any possible extension (adapter/locals). + intptr_t* unextended_sp = interpreter_frame_sender_sp(); + +#if COMPILER2_OR_JVMCI + if (map->update_map()) { + update_map_with_saved_link(map, (intptr_t**) addr_at(get_link_offset())); + } +#endif // COMPILER2_OR_JVMCI + + address sender_pc = *(address*) addr_at(get_return_addr_offset()); + intptr_t *link = *(intptr_t **)addr_at(get_link_offset()); + + return frame(sender_sp, unextended_sp, link, sender_pc); +} + + +//------------------------------------------------------------------------------ +// frame::sender_for_compiled_frame +frame frame::sender_for_compiled_frame(RegisterMap* map) const { + // we cannot rely upon the last fp having been saved to the thread + // in C2 code but it will have been pushed onto the stack. 
so we + // have to find it relative to the unextended sp + + assert(_cb->frame_size() >= 0, "must have non-zero frame size"); + intptr_t* l_sender_sp = unextended_sp() + _cb->frame_size(); + intptr_t* unextended_sp = l_sender_sp; + + // the return_address is always the word on the stack + address sender_pc = (address) *(l_sender_sp - 1 + get_return_addr_offset()); + + intptr_t** saved_fp_addr = (intptr_t**)(l_sender_sp - 1 + get_link_offset()); + + // assert (sender_sp() == l_sender_sp, "should be"); + // assert (*saved_fp_addr == link(), "should be"); + + if (map->update_map()) { + // Tell GC to use argument oopmaps for some runtime stubs that need it. + // For C1, the runtime stub might not have oop maps, so set this flag + // outside of update_register_map. + map->set_include_argument_oops(_cb->caller_must_gc_arguments(map->thread())); + if (_cb->oop_maps() != NULL) { + OopMapSet::update_register_map(this, map); + } + + // Since the prolog does the save and restore of FP there is no + // oopmap for it so we must fill in its location as if there was + // an oopmap entry since if our caller was compiled code there + // could be live jvm state in it. + update_map_with_saved_link(map, saved_fp_addr); + } + + return frame(l_sender_sp, unextended_sp, *saved_fp_addr, sender_pc); +} + +//------------------------------------------------------------------------------ +// frame::sender +frame frame::sender(RegisterMap* map) const { + // Default is we done have to follow them. The sender_for_xxx will + // update it accordingly + map->set_include_argument_oops(false); + + if (is_entry_frame()) { + return sender_for_entry_frame(map); + } + if (is_interpreted_frame()) { + return sender_for_interpreter_frame(map); + } + assert(_cb == CodeCache::find_blob(pc()),"Must be the same"); + + // This test looks odd: why is it not is_compiled_frame() ? That's + // because stubs also have OOP maps. + if (_cb != NULL) { + return sender_for_compiled_frame(map); + } + + // Must be native-compiled frame, i.e. the marshaling code for native + // methods that exists in the core system. + intptr_t *link = *(intptr_t**) addr_at(get_link_offset(VMFrameAPCS)); + return frame(sender_sp(), link, sender_pc()); +} + +bool frame::is_interpreted_frame_valid(JavaThread* thread) const { + assert(is_interpreted_frame(), "Not an interpreted frame"); + // These are reasonable sanity checks + if (fp() == 0 || (intptr_t(fp()) & (wordSize-1)) != 0) { + return false; + } + if (sp() == 0 || (intptr_t(sp()) & (wordSize-1)) != 0) { + return false; + } + if (fp() + get_interpreter_frame_initial_sp_offset() < sp()) { + return false; + } + // These are hacks to keep us out of trouble. 
+ // The problem with these is that they mask other problems + if (fp() <= sp()) { // this attempts to deal with unsigned comparison above + return false; + } + + // do some validation of frame elements + + // first the method + + Method* m = *interpreter_frame_method_addr(); + + // validate the method we'd find in this potential sender + if (!m->is_valid_method()) return false; + + // stack frames shouldn't be much larger than max_stack elements + // this test requires the use of unextended_sp which is the sp as seen by + // the current frame, and not sp which is the "raw" pc which could point + // further because of local variables of the callee method inserted after + // method arguments + if (fp() - unextended_sp() > 1024 + m->max_stack()*Interpreter::stackElementSize) { + return false; + } + + // validate bci/bcp + address bcp = interpreter_frame_bcp(); + if (m->validate_bci_from_bcp(bcp) < 0) { + return false; + } + + // validate constantPoolCache* + ConstantPoolCache* cp = *interpreter_frame_cache_addr(); + if (cp == NULL || !cp->is_metaspace_object()) return false; + + // validate locals + + address locals = (address) *interpreter_frame_locals_addr(); + + if (locals > thread->stack_base() || locals < (address) fp()) return false; + + // We'd have to be pretty unlucky to be mislead at this point + + return true; +} + +BasicType frame::interpreter_frame_result(oop* oop_result, jvalue* value_result) { +#ifdef CC_INTERP + // Needed for JVMTI. The result should always be in the + // interpreterState object + interpreterState istate = get_interpreterState(); +#endif // CC_INTERP + assert(is_interpreted_frame(), "interpreted frame expected"); + Method* method = interpreter_frame_method(); + BasicType type = method->result_type(); + + intptr_t* tos_addr; + if (method->is_native()) { + tos_addr = (intptr_t*)sp(); + if (type == T_FLOAT || type == T_DOUBLE) { + // This is times two because we do a push(ltos) after pushing D0 + // and that takes two interpreter stack slots. +#ifdef HARD_FLOAT_CC + tos_addr += 2 * Interpreter::stackElementWords; +#endif + } + } else { + tos_addr = (intptr_t*)interpreter_frame_tos_address(); + } + + switch (type) { + case T_OBJECT : + case T_ARRAY : { + oop obj; + if (method->is_native()) { + obj = cast_to_oop(at(interpreter_frame_oop_temp_offset)); + } else { + oop* obj_p = (oop*)tos_addr; + obj = (obj_p == NULL) ? 
(oop)NULL : *obj_p; + } + assert(obj == NULL || Universe::heap()->is_in(obj), "sanity check"); + *oop_result = obj; + break; + } + case T_BOOLEAN : value_result->z = *(jboolean*)tos_addr; break; + case T_BYTE : value_result->b = *(jbyte*)tos_addr; break; + case T_CHAR : value_result->c = *(jchar*)tos_addr; break; + case T_SHORT : value_result->s = *(jshort*)tos_addr; break; + case T_INT : value_result->i = *(jint*)tos_addr; break; + case T_LONG : value_result->j = *(jlong*)tos_addr; break; + case T_FLOAT : { + value_result->f = *(jfloat*)tos_addr; + break; + } + case T_DOUBLE : value_result->d = *(jdouble*)tos_addr; break; + case T_VOID : /* Nothing to do */ break; + default : ShouldNotReachHere(); + } + + return type; +} + + +intptr_t* frame::interpreter_frame_tos_at(jint offset) const { + int index = (Interpreter::expr_offset_in_bytes(offset)/wordSize); + return &interpreter_frame_tos_address()[index]; +} + +#ifndef PRODUCT + +#define DESCRIBE_FP_OFFSET(name) \ + values.describe(frame_no, fp() + frame::get_##name##_offset(), #name) + +void frame::describe_pd(FrameValues& values, int frame_no) { + if (is_interpreted_frame()) { + DESCRIBE_FP_OFFSET(interpreter_frame_sender_sp); + DESCRIBE_FP_OFFSET(interpreter_frame_last_sp); + DESCRIBE_FP_OFFSET(interpreter_frame_method); + DESCRIBE_FP_OFFSET(interpreter_frame_mdp); + DESCRIBE_FP_OFFSET(interpreter_frame_mirror); + DESCRIBE_FP_OFFSET(interpreter_frame_cache); + DESCRIBE_FP_OFFSET(interpreter_frame_locals); + DESCRIBE_FP_OFFSET(interpreter_frame_bcp); + DESCRIBE_FP_OFFSET(interpreter_frame_initial_sp); + } +} + +#endif // PRODUCT + +intptr_t *frame::initial_deoptimization_info() { + return real_fp(); +} + +intptr_t* frame::real_fp() const { + // Currently we have a fp for all frames + if (_cb != NULL) { + // use the frame size if valid + int size = _cb->frame_size(); + if (size > 0) { + return unextended_sp() + size; + } + } + // else rely on fp() + assert(! is_compiled_frame(), "unknown compiled frame size"); + return fp(); +} + +#undef DESCRIBE_FP_OFFSET + +#define NO_PARAM +#define DESCRIBE_FP_OFFSET(name, param) \ + { \ + unsigned long *p = (unsigned long *)fp; \ + printf("0x%016lx 0x%016lx %s\n", (unsigned long)(p + frame::get_##name##_offset(param)), \ + p[frame::get_##name##_offset(param)], #name); \ + } + +static __thread unsigned long nextfp; +static __thread unsigned long nextpc; +static __thread unsigned long nextsp; +static __thread RegisterMap *reg_map; + +static void printbc(Method *m, intptr_t bcp) { + const char *name; + char buf[16]; + if (m->validate_bci_from_bcp((address)bcp) < 0 || !m->contains((address) bcp)) { + name = "???"; + snprintf(buf, sizeof buf, "(bad)"); + } else { + int bci = m->bci_from((address)bcp); + snprintf(buf, sizeof buf, "%d", bci); + name = Bytecodes::name(m->code_at(bci)); + } + ResourceMark rm; + printf("%s : %s ==> %s\n", m->name_and_sig_as_C_string(), buf, name); +} + +void internal_pf(unsigned long sp, unsigned long fp, unsigned long pc, unsigned long bcx) { + if (! 
fp) + return; + + DESCRIBE_FP_OFFSET(interpreter_frame_sender_sp, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_last_sp, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_method, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_mdp, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_cache, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_locals, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_bcp, NO_PARAM); + DESCRIBE_FP_OFFSET(interpreter_frame_initial_sp, NO_PARAM); + + unsigned long *p = (unsigned long *)fp; + + // We want to see all frames, native and Java. For compiled and + // interpreted frames we have special information that allows us to + // unwind them; for everything else we assume that the native frame + // pointer chain is intact. + frame this_frame((intptr_t*)sp, (intptr_t*)fp, (address)pc); + if (this_frame.is_compiled_frame() || + this_frame.is_interpreted_frame()) { + DESCRIBE_FP_OFFSET(return_addr, FrameAPCS); + DESCRIBE_FP_OFFSET(link, FrameAPCS); + frame sender = this_frame.sender(reg_map); + nextfp = (unsigned long)sender.fp(); + nextpc = (unsigned long)sender.pc(); + nextsp = (unsigned long)sender.unextended_sp(); + } else { + DESCRIBE_FP_OFFSET(return_addr, VMFrameAPCS); + DESCRIBE_FP_OFFSET(link, VMFrameAPCS); + nextfp = p[frame::get_link_offset(VMFrameAPCS)]; + nextpc = p[frame::get_return_addr_offset(VMFrameAPCS)]; + nextsp = (unsigned long)&p[frame::sender_sp_offset]; + } + + if (bcx == -1ul) { + bcx = p[frame::get_interpreter_frame_bcp_offset()]; + } + + if (Interpreter::contains((address)pc)) { + Method* m = (Method*)p[frame::get_interpreter_frame_method_offset()]; + if(m && m->is_method()) { + printbc(m, bcx); + } else + printf("not a Method\n"); + } else { + CodeBlob *cb = CodeCache::find_blob((address)pc); + if (cb != NULL) { + if (cb->is_nmethod()) { + ResourceMark rm; + nmethod* nm = (nmethod*)cb; + printf("nmethod %s\n", nm->method()->name_and_sig_as_C_string()); + } else if (cb->name()) { + printf("CodeBlob %s\n", cb->name()); + } + } + } +} + +extern "C" void npf() { + CodeBlob *cb = CodeCache::find_blob((address)nextpc); + // C2 does not always chain the frame pointers when it can, instead + // preferring to use fixed offsets from SP, so a simple leave() does + // not work. Instead, it adds the frame size to SP then pops FP and + // LR. We have to do the same thing to get a good call chain. + if (cb && cb->frame_size()) + nextfp = nextsp + wordSize * (cb->frame_size() - 2); + internal_pf (nextsp, nextfp, nextpc, -1); +} + +extern "C" void pf(unsigned long sp, unsigned long fp, unsigned long pc, + unsigned long bcx, unsigned long thread) { + RegisterMap map((JavaThread*)thread, false); + if (!reg_map) { + reg_map = (RegisterMap*)os::malloc(sizeof map, mtNone); + } + memcpy(reg_map, &map, sizeof map); + { + CodeBlob *cb = CodeCache::find_blob((address)pc); + if (cb && cb->frame_size()) + fp = sp + wordSize * (cb->frame_size() - 2); + } + internal_pf(sp, fp, pc, bcx); +} + +// support for printing out where we are in a Java method +// needs to be passed current fp and bcp register values +// prints method name, bc index and bytecode name +extern "C" void pm(unsigned long fp, unsigned long bcx) { + DESCRIBE_FP_OFFSET(interpreter_frame_method, NO_PARAM); + unsigned long *p = (unsigned long *)fp; + Method* m = (Method*)p[frame::get_interpreter_frame_method_offset()]; + printbc(m, bcx); +} + +#ifndef PRODUCT +// This is a generic constructor which is only used by pns() in debug.cpp. 
+frame::frame(void* sp, void* fp, void* pc) { + init((intptr_t*)sp, (intptr_t*)fp, (address)pc); +} + +void frame::pd_ps() {} +#endif + +void JavaFrameAnchor::make_walkable(JavaThread* thread) { + // last frame set? + if (last_Java_sp() == NULL) return; + // already walkable? + if (walkable()) return; + vmassert(Thread::current() == (Thread*)thread, "not current thread"); + vmassert(last_Java_sp() != NULL, "not called from Java code?"); + vmassert(last_Java_pc() == NULL, "already walkable"); + capture_last_Java_pc(); + vmassert(walkable(), "something went wrong"); +} + +void JavaFrameAnchor::capture_last_Java_pc() { + vmassert(_last_Java_sp != NULL, "no last frame set"); + vmassert(_last_Java_pc == NULL, "already walkable"); + _last_Java_pc = (address)_last_Java_sp[-1]; +} --- /dev/null 2018-09-25 19:24:46.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/frame_aarch32.hpp 2018-09-25 19:24:46.000000000 +0300 @@ -0,0 +1,189 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_FRAME_AARCH32_HPP +#define CPU_AARCH32_VM_FRAME_AARCH32_HPP + +#include "runtime/synchronizer.hpp" + +// A frame represents a physical stack frame (an activation). Frames can be +// C or Java frames, and the Java frames can be interpreted or compiled. +// In contrast, vframes represent source-level activations, so that one physical frame +// can correspond to multiple source level frames because of inlining. +// A frame is comprised of {pc, fp, sp} +// ------------------------------ Asm interpreter ---------------------------------------- +// Layout of asm interpreter frame: +// [expression stack ] * <- sp + +// [monitors[0] ] \ +// ... 
| monitor block size = k +// [monitors[k-1] ] / +// [frame initial esp ] ( == &monitors[0], initially here) initial_sp_offset +// [byte code index/pointr] = bcx() bcx_offset + +// [pointer to locals ] = locals() locals_offset +// [constant pool cache ] = cache() cache_offset + +// [klass of method ] = mirror() mirror_offset +// [padding ] + +// [methodData ] = mdp() mdx_offset +// [methodOop ] = method() method_offset + +// [last esp ] = last_sp() last_sp_offset +// [old stack pointer ] (sender_sp) sender_sp_offset + +// [old frame pointer ] = link() +// [return pc ] <- fp + +// [last sp ] +// [oop temp ] (only for native calls) + +// [locals and parameters ] +// <- sender sp +// ------------------------------ Asm interpreter ---------------------------------------- + + public: + enum { + pc_return_offset = 0, + // All frames + sender_sp_offset = 1, + + // we don't need a save area + arg_reg_save_area_bytes = 0, + + // Interpreter frames + interpreter_frame_oop_temp_offset = 2, // for native calls only + + interpreter_frame_sender_sp_offset = 0, + // outgoing sp before a call to an invoked method + interpreter_frame_last_sp_offset = interpreter_frame_sender_sp_offset - 1, + interpreter_frame_method_offset = interpreter_frame_last_sp_offset - 1, + interpreter_frame_mdp_offset = interpreter_frame_method_offset - 1, + interpreter_frame_padding_offset = interpreter_frame_mdp_offset - 1, + interpreter_frame_mirror_offset = interpreter_frame_padding_offset - 1, + interpreter_frame_cache_offset = interpreter_frame_mirror_offset - 1, + interpreter_frame_locals_offset = interpreter_frame_cache_offset - 1, + interpreter_frame_bcp_offset = interpreter_frame_locals_offset - 1, + interpreter_frame_initial_sp_offset = interpreter_frame_bcp_offset - 1, + + interpreter_frame_monitor_block_top_offset = interpreter_frame_initial_sp_offset, + interpreter_frame_monitor_block_bottom_offset = interpreter_frame_initial_sp_offset, + }; + + static int get_frame_size(bool apcs = FrameAPCS) { + return apcs ? 4 : 2; + } + + static int get_link_offset(bool apcs = FrameAPCS) { + return apcs ? -3 : -1; + } + + static int get_return_addr_offset(bool apcs = FrameAPCS) { + return apcs ? -1 : 0; + } + + // Entry frames + // n.b. 
these values are determined by the layout defined in + // stubGenerator for the Java call stub + static int get_entry_frame_after_call_words() { return (get_frame_size(VMFrameAPCS) + 10) + (StackAlignmentInBytes/BytesPerWord); } + static int get_entry_frame_call_wrapper_offset() { return -(get_frame_size(VMFrameAPCS) + 10); } + + static int get_offset_from_rfp_bytes() { return wordSize * (get_frame_size(VMFrameAPCS) - 1); } + static int get_interpreter_frame_oop_temp_offset() { return interpreter_frame_oop_temp_offset; } + static int get_interpreter_frame_sender_sp_offset() { return -get_frame_size() + interpreter_frame_sender_sp_offset; } + static int get_interpreter_frame_last_sp_offset() { return -get_frame_size() + interpreter_frame_last_sp_offset; } + static int get_interpreter_frame_method_offset() { return -get_frame_size() + interpreter_frame_method_offset; } + static int get_interpreter_frame_mdp_offset() { return -get_frame_size() + interpreter_frame_mdp_offset; } + static int get_interpreter_frame_padding_offset() { return -get_frame_size() + interpreter_frame_padding_offset; } + static int get_interpreter_frame_mirror_offset() { return -get_frame_size() + interpreter_frame_mirror_offset; } + static int get_interpreter_frame_cache_offset() { return -get_frame_size() + interpreter_frame_cache_offset; } + static int get_interpreter_frame_locals_offset() { return -get_frame_size() + interpreter_frame_locals_offset; } + static int get_interpreter_frame_bcp_offset() { return -get_frame_size() + interpreter_frame_bcp_offset; } + static int get_interpreter_frame_initial_sp_offset() { return -get_frame_size() + interpreter_frame_initial_sp_offset; } + static int get_interpreter_frame_monitor_block_top_offset() { return -get_frame_size() + interpreter_frame_monitor_block_top_offset; } + static int get_interpreter_frame_monitor_block_bottom_offset() { return -get_frame_size() + interpreter_frame_monitor_block_bottom_offset; } + + intptr_t ptr_at(int offset) const { + return *ptr_at_addr(offset); + } + + void ptr_at_put(int offset, intptr_t value) { + *ptr_at_addr(offset) = value; + } + + private: + // an additional field beyond _sp and _pc: + intptr_t* _fp; // frame pointer + // The interpreter and adapters will extend the frame of the caller. + // Since oopMaps are based on the sp of the caller before extension + // we need to know that value. However in order to compute the address + // of the return address we need the real "raw" sp. Since sparc already + // uses sp() to mean "raw" sp and unextended_sp() to mean the caller's + // original sp we use that convention. 
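
All of the get_interpreter_frame_*_offset() accessors above fold the raw slot indices through -get_frame_size(), so a single set of indices serves both the 2-word and the 4-word (APCS-style) frame header. The standalone sketch below reproduces that arithmetic; which header FrameAPCS and VMFrameAPCS actually select is a build-time choice of the port, so the flag is passed explicitly here.

    #include <cstdio>

    // Header-size and offset rules mirrored from frame_aarch32.hpp above.
    static int frame_size(bool apcs)         { return apcs ? 4 : 2; }
    static int link_offset(bool apcs)        { return apcs ? -3 : -1; }
    static int return_addr_offset(bool apcs) { return apcs ? -1 : 0; }

    // Raw interpreter slot indices, counted down from the sender-sp slot.
    enum {
      interp_sender_sp = 0,
      interp_last_sp   = interp_sender_sp - 1,
      interp_method    = interp_last_sp - 1,
      interp_mdp       = interp_method - 1,
    };

    // fp-relative offset of an interpreter slot, as in the accessors above.
    static int interp_offset(int raw_slot, bool apcs) {
      return -frame_size(apcs) + raw_slot;
    }

    int main() {
      for (int apcs = 0; apcs <= 1; ++apcs) {
        printf("header=%d words: link=%d ret=%d sender_sp=%d method=%d\n",
               frame_size(apcs), link_offset(apcs), return_addr_offset(apcs),
               interp_offset(interp_sender_sp, apcs),
               interp_offset(interp_method, apcs));
      }
      return 0;
    }
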
+ + intptr_t* _unextended_sp; + void adjust_unextended_sp(); + + intptr_t* ptr_at_addr(int offset) const { + return (intptr_t*) addr_at(offset); + } + +#ifdef ASSERT + // Used in frame::sender_for_{interpreter,compiled}_frame + static void verify_deopt_original_pc( CompiledMethod* nm, intptr_t* unextended_sp); +#endif + + public: + // Constructors + + frame(intptr_t* sp, intptr_t* fp, address pc); + + frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc); + + frame(intptr_t* sp, intptr_t* fp); + + void init(intptr_t* sp, intptr_t* fp, address pc); + + // accessors for the instance variables + // Note: not necessarily the real 'frame pointer' (see real_fp) + intptr_t* fp() const { return _fp; } + + inline address* sender_pc_addr() const; + + // expression stack tos if we are nested in a java call + intptr_t* interpreter_frame_last_sp() const; + + // helper to update a map with callee-saved RBP + static void update_map_with_saved_link(RegisterMap* map, intptr_t** link_addr); + + // deoptimization support + void interpreter_frame_set_last_sp(intptr_t* sp); + + static jint interpreter_frame_expression_stack_direction() { return -1; } + +#endif // CPU_AARCH32_VM_FRAME_AARCH32_HPP --- /dev/null 2018-09-25 19:24:47.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/frame_aarch32.inline.hpp 2018-09-25 19:24:47.000000000 +0300 @@ -0,0 +1,249 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_FRAME_AARCH32_INLINE_HPP +#define CPU_AARCH32_VM_FRAME_AARCH32_INLINE_HPP + +#include "code/codeCache.hpp" +#include "code/vmreg.inline.hpp" + +// Inline functions for AArch64 frames: + +// Constructors: + +inline frame::frame() { + _pc = NULL; + _sp = NULL; + _unextended_sp = NULL; + _fp = NULL; + _cb = NULL; + _deopt_state = unknown; +} + +static int spin; + +inline void frame::init(intptr_t* sp, intptr_t* fp, address pc) { + intptr_t a = intptr_t(sp); + intptr_t b = intptr_t(fp); + _sp = sp; + _unextended_sp = sp; + _fp = fp; + _pc = pc; + assert(pc != NULL, "no pc?"); + _cb = CodeCache::find_blob(pc); + adjust_unextended_sp(); + + address original_pc = CompiledMethod::get_deopt_original_pc(this); + if (original_pc != NULL) { + _pc = original_pc; + _deopt_state = is_deoptimized; + } else { + _deopt_state = not_deoptimized; + } +} + +inline frame::frame(intptr_t* sp, intptr_t* fp, address pc) { + init(sp, fp, pc); +} + +inline frame::frame(intptr_t* sp, intptr_t* unextended_sp, intptr_t* fp, address pc) { + intptr_t a = intptr_t(sp); + intptr_t b = intptr_t(fp); + _sp = sp; + _unextended_sp = unextended_sp; + _fp = fp; + _pc = pc; + assert(pc != NULL, "no pc?"); + _cb = CodeCache::find_blob(pc); + adjust_unextended_sp(); + + address original_pc = CompiledMethod::get_deopt_original_pc(this); + if (original_pc != NULL) { + _pc = original_pc; + assert(_cb->as_compiled_method()->insts_contains_inclusive(_pc), + "original PC must be in the main code section of the the compiled method (or must be immediately following it)"); + _deopt_state = is_deoptimized; + } else { + _deopt_state = not_deoptimized; + } +} + +inline frame::frame(intptr_t* sp, intptr_t* fp) { + _sp = sp; + _unextended_sp = sp; + _fp = fp; + assert(sp != NULL, "null SP ?"); + // C2 generated code does not use or set fp + _pc = (address)(/*fp != NULL ? fp[0] : */sp[-1]); + + // Here's a sticky one. This constructor can be called via AsyncGetCallTrace + // when last_Java_sp is non-null but the pc fetched is junk. If we are truly + // unlucky the junk value could be to a zombied method and we'll die on the + // find_blob call. This is also why we can have no asserts on the validity + // of the pc we find here. AsyncGetCallTrace -> pd_get_top_frame_for_signal_handler + // -> pd_last_frame should use a specialized version of pd_last_frame which could + // call a specilaized frame constructor instead of this one. + // Then we could use the assert below. However this assert is of somewhat dubious + // value. + // assert(_pc != NULL, "no pc?"); + + _cb = CodeCache::find_blob(_pc); + adjust_unextended_sp(); + + address original_pc = CompiledMethod::get_deopt_original_pc(this); + if (original_pc != NULL) { + _pc = original_pc; + _deopt_state = is_deoptimized; + } else { + _deopt_state = not_deoptimized; + } +} + +// Accessors + +inline bool frame::equal(frame other) const { + bool ret = sp() == other.sp() + && unextended_sp() == other.unextended_sp() + && fp() == other.fp() + && pc() == other.pc(); + assert(!ret || ret && cb() == other.cb() && _deopt_state == other._deopt_state, "inconsistent construction"); + return ret; +} + +// Return unique id for this frame. The id must have a value where we can distinguish +// identity and younger/older relationship. NULL represents an invalid (incomparable) +// frame. 
+inline intptr_t* frame::id(void) const { return unextended_sp(); } + +// Relationals on frames based +// Return true if the frame is younger (more recent activation) than the frame represented by id +inline bool frame::is_younger(intptr_t* id) const { assert(this->id() != NULL && id != NULL, "NULL frame id"); + return this->id() < id ; } + +// Return true if the frame is older (less recent activation) than the frame represented by id +inline bool frame::is_older(intptr_t* id) const { assert(this->id() != NULL && id != NULL, "NULL frame id"); + return this->id() > id ; } + +inline intptr_t* frame::link() const { return *(intptr_t **)addr_at(get_link_offset(VMFrameAPCS)); } + +inline intptr_t* frame::unextended_sp() const { return _unextended_sp; } + +inline address frame::sender_pc() const { return *(address*) addr_at(get_return_addr_offset(VMFrameAPCS)); } + +inline intptr_t* frame::sender_sp() const { return addr_at(sender_sp_offset); } + +inline intptr_t** frame::interpreter_frame_locals_addr() const { + return (intptr_t**)addr_at(get_interpreter_frame_locals_offset()); +} + +inline intptr_t* frame::interpreter_frame_last_sp() const { + return *(intptr_t**)addr_at(get_interpreter_frame_last_sp_offset()); +} + +inline intptr_t* frame::interpreter_frame_bcp_addr() const { + return (intptr_t*) addr_at(get_interpreter_frame_bcp_offset()); +} + +inline intptr_t* frame::interpreter_frame_mdp_addr() const { + return (intptr_t*) addr_at(get_interpreter_frame_mdp_offset()); +} + + +// Constant pool cache + +inline ConstantPoolCache** frame::interpreter_frame_cache_addr() const { + return (ConstantPoolCache**)addr_at(get_interpreter_frame_cache_offset()); +} + +// Method + +inline Method** frame::interpreter_frame_method_addr() const { + return (Method**)addr_at(get_interpreter_frame_method_offset()); +} + +// Mirror + +inline oop* frame::interpreter_frame_mirror_addr() const { + return (oop*)addr_at(get_interpreter_frame_mirror_offset()); +} + +// top of expression stack +inline intptr_t* frame::interpreter_frame_tos_address() const { + intptr_t* last_sp = interpreter_frame_last_sp(); + if (last_sp == NULL) { + return sp(); + } else { + // sp() may have been extended or shrunk by an adapter. At least + // check that we don't fall behind the legal region. + // For top deoptimized frame last_sp == interpreter_frame_monitor_end. 
+ assert(last_sp <= (intptr_t*) interpreter_frame_monitor_end(), "bad tos"); + return last_sp; + } +} + +inline oop* frame::interpreter_frame_temp_oop_addr() const { + return (oop *)(fp() + interpreter_frame_oop_temp_offset); +} + +inline int frame::interpreter_frame_monitor_size() { + return BasicObjectLock::size(); +} + + +// expression stack +// (the max_stack arguments are used by the GC; see class FrameClosure) + +inline intptr_t* frame::interpreter_frame_expression_stack() const { + intptr_t* monitor_end = (intptr_t*) interpreter_frame_monitor_end(); + return monitor_end-1; +} + + +// Entry frames + +inline JavaCallWrapper** frame::entry_frame_call_wrapper_addr() const { + return (JavaCallWrapper**)addr_at(get_entry_frame_call_wrapper_offset()); +} + + +// Compiled frames + +inline oop frame::saved_oop_result(RegisterMap* map) const { + oop* result_adr = (oop *)map->location(r0->as_VMReg()); + guarantee(result_adr != NULL, "bad register save location"); + + return (*result_adr); +} + +inline void frame::set_saved_oop_result(RegisterMap* map, oop obj) { + oop* result_adr = (oop *)map->location(r0->as_VMReg()); + guarantee(result_adr != NULL, "bad register save location"); + + *result_adr = obj; +} + +#endif // CPU_AARCH32_VM_FRAME_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:24:48.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/g1/g1BarrierSetAssembler_aarch32.cpp 2018-09-25 19:24:48.000000000 +0300 @@ -0,0 +1,471 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "c1/c1_LIRAssembler.hpp" +#include "c1/c1_MacroAssembler.hpp" +#include "gc/g1/c1/g1BarrierSetC1.hpp" +#include "gc/g1/g1BarrierSet.hpp" +#include "gc/g1/g1BarrierSetAssembler.hpp" +#include "gc/g1/g1BarrierSetRuntime.hpp" +#include "gc/g1/g1CardTable.hpp" +#include "gc/g1/g1ThreadLocalData.hpp" +#include "gc/g1/heapRegion.hpp" +#include "gc/shared/collectedHeap.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/thread.hpp" +#include "interpreter/interp_masm.hpp" +#include "runtime/sharedRuntime.hpp" + +#define __ masm-> + +void G1BarrierSetAssembler::gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register addr, Register count) { + bool dest_uninitialized = (decorators & IS_DEST_UNINITIALIZED) != 0; + // With G1, don't generate the call if we statically know that the target in uninitialized + if (!dest_uninitialized) { + __ push(RegSet::range(r0, r3), sp); + if (count == c_rarg0) { + if (addr == c_rarg1) { + // exactly backwards!! + __ eor(c_rarg0, c_rarg0, c_rarg1); + __ eor(c_rarg1, c_rarg0, c_rarg1); + __ eor(c_rarg0, c_rarg0, c_rarg1); + } else { + __ mov(c_rarg1, count); + __ mov(c_rarg0, addr); + } + } else { + __ mov(c_rarg0, addr); + __ mov(c_rarg1, count); + } + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_pre_oop_entry), 2); + __ pop(RegSet::range(r0, r3), sp); + } +} + +void G1BarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register start, Register end, Register scratch) { + // must compute element count unless barrier set interface is changed (other platforms supply count) + assert_different_registers(start, end, scratch); + __ lea(scratch, Address(end, BytesPerHeapOop)); + __ sub(scratch, scratch, start); // subtract start to get #bytes + __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count + __ mov(c_rarg0, start); + __ mov(c_rarg1, scratch); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_array_post_entry), 2); +} + + +void G1BarrierSetAssembler::g1_write_barrier_pre(MacroAssembler* masm, + Address obj, + Register pre_val, + Register thread, + Register tmp, + bool tosca_live, + bool expand_call) { + // If expand_call is true then we expand the call_VM_leaf macro + // directly to skip generating the check by + // InterpreterMacroAssembler::call_VM_leaf_base that checks _last_sp. + + assert(thread == rthread, "must be"); + + Label done; + Label runtime; + + assert(pre_val != noreg, "check this code"); + + assert_different_registers(pre_val, tmp); + if (obj.get_mode() != Address::no_mode) + assert(!obj.uses(pre_val) && !obj.uses(tmp), "destroys register"); + + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + Address index(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())); + Address buffer(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())); + + + // Is marking active? + if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { + __ ldr(tmp, in_progress); + } else { + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ ldrb(tmp, in_progress); + } + __ cbz(tmp, done); + + // Do we need to load the previous value? + if (obj.get_mode() != Address::no_mode) { + __ load_heap_oop(pre_val, obj, noreg, noreg, AS_RAW); + } + + // Is the previous value null? 
+ __ cbz(pre_val, done); + + // Can we store original value in the thread's buffer? + // Is index == 0? + // (The index field is typed as size_t.) + + __ ldr(tmp, index); // tmp := *index_adr + __ cbz(tmp, runtime); // tmp == 0? + // If yes, goto runtime + + __ sub(tmp, tmp, wordSize); // tmp := tmp - wordSize + __ str(tmp, index); // *index_adr := tmp + __ ldr(rscratch1, buffer); + __ add(tmp, tmp, rscratch1); // tmp := tmp + *buffer_adr + + // Record the previous value + __ str(pre_val, Address(tmp)); + __ b(done); + + __ bind(runtime); + // save the live input values + __ push(r0->bit(tosca_live) | obj.reg_bits() | pre_val->bit(true) | lr->bit(true), sp); + + // Calling the runtime using the regular call_VM_leaf mechanism generates + // code (generated by InterpreterMacroAssember::call_VM_leaf_base) + // that checks that the *(rfp+frame::interpreter_frame_last_sp) == NULL. + // + // If we care generating the pre-barrier without a frame (e.g. in the + // intrinsified Reference.get() routine) then ebp might be pointing to + // the caller frame and so this check will most likely fail at runtime. + // + // Expanding the call directly bypasses the generation of the check. + // So when we do not have have a full interpreter frame on the stack + // expand_call should be passed true. + + if (expand_call) { + assert(pre_val != c_rarg1, "smashed arg"); + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), pre_val, thread); + } else { + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), pre_val, thread); + } + + __ pop(r0->bit(tosca_live) | obj.reg_bits() | pre_val->bit(true) | lr->bit(true), sp); + + __ bind(done); +} + +void G1BarrierSetAssembler::g1_write_barrier_post(MacroAssembler* masm, + Address store_addr, + Register new_val, + Register thread, + Register tmp, + Register tmp2) { + assert(thread == rthread, "must be"); + + Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); + Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); + + BarrierSet* bs = BarrierSet::barrier_set(); + CardTableBarrierSet* ctbs = barrier_set_cast(bs); + CardTable* ct = ctbs->card_table(); + assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); + + Label done; + Label runtime; + + // Does store cross heap regions? + + __ lea(tmp2, store_addr); + __ eor(tmp, tmp2, new_val); + __ lsrs(tmp, tmp, HeapRegion::LogOfHRGrainBytes); + __ b(done, Assembler::EQ); + + // crosses regions, storing NULL? + + __ cbz(new_val, done); + + // storing region crossing non-NULL, is card already dirty? + + assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); + const Register card_addr = tmp; + + __ lsr(card_addr, tmp2, CardTable::card_shift); + + //ExternalAddress cardtable((address) ct->byte_map_base()); + __ mov(tmp2, (unsigned)ct->byte_map_base()); + + // get the address of the card + __ add(card_addr, card_addr, tmp2); + __ ldrb(tmp2, Address(card_addr)); + __ cmp(tmp2, (int)G1CardTable::g1_young_card_val()); + __ b(done, Assembler::EQ); + + assert((int)CardTable::dirty_card_val() == 0, "must be 0"); + + __ membar(Assembler::StoreLoad); + + __ ldrb(tmp2, Address(card_addr)); + __ cbz(tmp2, done); + + // storing a region crossing, non-NULL oop, card is clean. + // dirty card and log. 
+ __ mov(rscratch1, 0); + __ strb(rscratch1, Address(card_addr)); + + __ ldr(rscratch1, queue_index); + __ cbz(rscratch1, runtime); + __ sub(rscratch1, rscratch1, wordSize); + __ str(rscratch1, queue_index); + + __ ldr(tmp2, buffer); + __ str(card_addr, Address(tmp2, rscratch1)); + __ b(done); + + __ bind(runtime); + // save the live input values + __ push(store_addr.reg_bits() | new_val->bit(true), sp); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); + __ pop(store_addr.reg_bits() | new_val->bit(true), sp); + + __ bind(done); +} + +void G1BarrierSetAssembler::load_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Register dst, Address src, Register tmp1, Register tmp_thread) { + bool on_oop = type == T_OBJECT || type == T_ARRAY; + bool on_weak = (decorators & ON_WEAK_OOP_REF) != 0; + bool on_phantom = (decorators & ON_PHANTOM_OOP_REF) != 0; + bool on_reference = on_weak || on_phantom; + ModRefBarrierSetAssembler::load_word_at(masm, decorators, type, dst, src, tmp1, tmp_thread); + if (on_oop && on_reference) { + // LR is live. It must be saved around calls. + // Generate the G1 pre-barrier code to log the value of + // the referent field in an SATB buffer. + g1_write_barrier_pre(masm /* masm */, + Address() /* obj */, + dst /* pre_val */, + rthread /* thread */, + tmp1 /* tmp */, + true /* tosca_live */, + true /* expand_call */); + } +} + +void G1BarrierSetAssembler::load_tos_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address src, Register tmp1, Register tmp_thread) { + bool on_oop = type == T_OBJECT || type == T_ARRAY; + bool on_weak = (decorators & ON_WEAK_OOP_REF) != 0; + bool on_phantom = (decorators & ON_PHANTOM_OOP_REF) != 0; + bool on_reference = on_weak || on_phantom; + ModRefBarrierSetAssembler::load_tos_at(masm, decorators, type, src, tmp1, tmp_thread); + if (on_oop && on_reference) { + // Generate the G1 pre-barrier code to log the value of + // the referent field in an SATB buffer. + g1_write_barrier_pre(masm /* masm */, + Address() /* obj */, + r0 /* pre_val */, // atos is in r0 + rthread /* thread */, + tmp1 /* tmp */, + true /* tosca_live */, + true /* expand_call */); + } +} + +void G1BarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2) { + g1_write_barrier_pre(masm, + dst /* obj */, + tmp2 /* pre_val */, + rthread /* thread */, + tmp1 /* tmp */, + val != noreg /* tosca_live */, + false /* expand_call */); + + if (val == noreg) { + BarrierSetAssembler::store_word_at(masm, decorators, type, dst, noreg, tmp1, noreg); + } else { + BarrierSetAssembler::store_word_at(masm, decorators, type, dst, val, noreg, noreg); + g1_write_barrier_post(masm, + dst /* store_adr */, + val /* new_val */, + rthread /* thread */, + tmp1 /* tmp */, + tmp2 /* tmp2 */); + } + +} + +#ifdef COMPILER1 + +#undef __ +#define __ ce->masm()-> + +void G1BarrierSetAssembler::gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub) { + G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); + // At this point we know that marking is in progress. + // If do_load() is true then we have to emit the + // load of the previous value; otherwise it has already + // been loaded into _pre_val. 
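+  // The stub itself stays small: it null-checks pre_val (a null previous value
+  // needs no logging), stores it as the single stub parameter, far-calls the
+  // shared runtime blob emitted by generate_c1_pre_barrier_runtime_stub() below,
+  // and then branches back to the continuation.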
+ + __ bind(*stub->entry()); + + assert(stub->pre_val()->is_register(), "Precondition."); + + Register pre_val_reg = stub->pre_val()->as_register(); + + if (stub->do_load()) { + ce->mem2reg(stub->addr(), stub->pre_val(), T_OBJECT, stub->patch_code(), stub->info(), false /*wide*/, false /*unaligned*/); + } + __ cbz(pre_val_reg, *stub->continuation()); + ce->store_parameter(stub->pre_val()->as_register(), 0); + __ far_call(RuntimeAddress(bs->pre_barrier_c1_runtime_code_blob()->code_begin())); + __ b(*stub->continuation()); +} + +void G1BarrierSetAssembler::gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub) { + G1BarrierSetC1* bs = (G1BarrierSetC1*)BarrierSet::barrier_set()->barrier_set_c1(); + __ bind(*stub->entry()); + assert(stub->addr()->is_register(), "Precondition."); + assert(stub->new_val()->is_register(), "Precondition."); + Register new_val_reg = stub->new_val()->as_register(); + __ cbz(new_val_reg, *stub->continuation()); + ce->store_parameter(stub->addr()->as_pointer_register(), 0); + __ far_call(RuntimeAddress(bs->post_barrier_c1_runtime_code_blob()->code_begin())); + __ b(*stub->continuation()); +} + +#undef __ + +#define __ sasm-> + +void G1BarrierSetAssembler::generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm) { + __ prologue("g1_pre_barrier", false); + + // arg0 : previous value of memory + + BarrierSet* bs = BarrierSet::barrier_set(); + + const Register pre_val = r0; + const Register thread = rthread; + const Register tmp = rscratch1; + + Address in_progress(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_active_offset())); + Address queue_index(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_index_offset())); + Address buffer(thread, in_bytes(G1ThreadLocalData::satb_mark_queue_buffer_offset())); + + Label done; + Label runtime; + + // Is marking still active? + if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { + __ ldr(tmp, in_progress); + } else { + assert(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); + __ ldrb(tmp, in_progress); + } + __ cbz(tmp, done); + + // Can we store original value in the thread's buffer? + __ ldr(tmp, queue_index); + __ cbz(tmp, runtime); + + __ sub(tmp, tmp, wordSize); + __ str(tmp, queue_index); + __ ldr(rscratch2, buffer); + __ add(tmp, tmp, rscratch2); + __ load_parameter(0, rscratch2); + __ str(rscratch2, Address(tmp, 0)); + __ b(done); + + __ bind(runtime); + __ push_call_clobbered_registers(); + __ load_parameter(0, pre_val); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_pre_entry), pre_val, thread); + __ pop_call_clobbered_registers(); + __ bind(done); + + __ epilogue(); +} + +void G1BarrierSetAssembler::generate_c1_post_barrier_runtime_stub(StubAssembler* sasm) { + __ prologue("g1_post_barrier", false); + + // arg0: store_address + Address store_addr(rfp, 2*BytesPerWord); + + BarrierSet* bs = BarrierSet::barrier_set(); + CardTableBarrierSet* ctbs = barrier_set_cast(bs); + CardTable* ct = ctbs->card_table(); + assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); + + Label done; + Label runtime; + + // At this point we know new_value is non-NULL and the new_value crosses regions. 
+ // Must check to see if card is already dirty + + const Register thread = rthread; + + Address queue_index(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_index_offset())); + Address buffer(thread, in_bytes(G1ThreadLocalData::dirty_card_queue_buffer_offset())); + + const Register card_addr = rscratch2; + ExternalAddress cardtable((address) ct->byte_map_base()); + + __ load_parameter(0, card_addr); + __ lsr(card_addr, card_addr, CardTable::card_shift); + __ mov(rscratch1, cardtable); + __ add(card_addr, card_addr, rscratch1); + __ ldrb(rscratch1, Address(card_addr)); + __ cmp(rscratch1, (int)G1CardTable::g1_young_card_val()); + __ b(done, Assembler::EQ); + + assert((int)CardTable::dirty_card_val() == 0, "must be 0"); + + __ membar(Assembler::StoreLoad); + __ ldrb(rscratch1, Address(card_addr)); + __ cbz(rscratch1, done); + + // storing region crossing non-NULL, card is clean. + // dirty card and log. + __ mov(rscratch1, 0); + __ strb(rscratch1, Address(card_addr)); + + __ ldr(rscratch1, queue_index); + __ cbz(rscratch1, runtime); + __ sub(rscratch1, rscratch1, wordSize); + __ str(rscratch1, queue_index); + + // Reuse LR to hold buffer_addr + const Register buffer_addr = lr; + + __ ldr(buffer_addr, buffer); + __ str(card_addr, Address(buffer_addr, rscratch1)); + __ b(done); + + __ bind(runtime); + __ push_call_clobbered_registers(); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, G1BarrierSetRuntime::write_ref_field_post_entry), card_addr, thread); + __ pop_call_clobbered_registers(); + __ bind(done); + __ epilogue(); +} + +#undef __ + +#endif // COMPILER1 --- /dev/null 2018-09-25 19:24:49.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/g1/g1BarrierSetAssembler_aarch32.hpp 2018-09-25 19:24:49.000000000 +0300 @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_GC_G1_G1BARRIERSETASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_GC_G1_G1BARRIERSETASSEMBLER_AARCH32_HPP + +#include "asm/macroAssembler.hpp" +#include "gc/shared/modRefBarrierSetAssembler.hpp" +#include "utilities/macros.hpp" + +class LIR_Assembler; +class StubAssembler; +class G1PreBarrierStub; +class G1PostBarrierStub; + +class G1BarrierSetAssembler: public ModRefBarrierSetAssembler { +protected: + void gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register addr, Register count); + void gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register start, Register end, Register tmp); + + void g1_write_barrier_pre(MacroAssembler* masm, + Address obj, + Register pre_val, + Register thread, + Register tmp, + bool tosca_live, + bool expand_call); + + void g1_write_barrier_post(MacroAssembler* masm, + Address store_addr, + Register new_val, + Register thread, + Register tmp, + Register tmp2); + + virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2); + +public: +#ifdef COMPILER1 + void gen_pre_barrier_stub(LIR_Assembler* ce, G1PreBarrierStub* stub); + void gen_post_barrier_stub(LIR_Assembler* ce, G1PostBarrierStub* stub); + + void generate_c1_pre_barrier_runtime_stub(StubAssembler* sasm); + void generate_c1_post_barrier_runtime_stub(StubAssembler* sasm); +#endif + + void load_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Register dst, Address src, Register tmp1, Register tmp_thread); + void load_tos_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address src, Register tmp1, Register tmp_thread); +}; + +#endif // CPU_AARCH32_GC_G1_G1BARRIERSETASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:50.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/shared/barrierSetAssembler_aarch32.cpp 2018-09-25 19:24:50.000000000 +0300 @@ -0,0 +1,318 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "gc/shared/collectedHeap.hpp" +#include "runtime/jniHandles.hpp" +#include "runtime/thread.hpp" + +#define __ masm-> + +void BarrierSetAssembler::load_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Register dst, Address src, Register tmp1, Register tmp_thread) { + + // LR is live. 
It must be saved around calls. + + bool in_heap = (decorators & IN_HEAP) != 0; + bool in_native = (decorators & IN_NATIVE) != 0; + bool is_not_null = (decorators & IS_NOT_NULL) != 0; + switch (type) { + case T_OBJECT: + case T_ARRAY: { + assert(in_heap || in_native, "why else?"); + __ ldr(dst, src); + break; + } + case T_INT: __ ldr(dst, src); break; + case T_ADDRESS: __ ldr(dst, src); break; + default: Unimplemented(); + } +} + +void BarrierSetAssembler::store_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2) { + bool in_heap = (decorators & IN_HEAP) != 0; + bool in_native = (decorators & IN_NATIVE) != 0; + switch (type) { + case T_OBJECT: + case T_ARRAY: { + if (val == noreg) { + assert(tmp1 != noreg, "must provide valid register"); + __ mov(tmp1, 0); + val = tmp1; + } + assert(in_heap || in_native, "why else?"); + __ str(val, dst); + break; + } + case T_INT: __ str(val, dst); break; + case T_ADDRESS: __ str(val, dst); break; + default: Unimplemented(); + } +} + +void BarrierSetAssembler::load_tos_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address src, Register tmp1, Register tmp_thread) { + + // LR is live. It must be saved around calls. + + bool in_heap = (decorators & IN_HEAP) != 0; + bool in_native = (decorators & IN_NATIVE) != 0; + bool is_not_null = (decorators & IS_NOT_NULL) != 0; + bool atomic = (decorators & MO_SEQ_CST) != 0; + switch (type) { + case T_OBJECT: + case T_ARRAY: { + assert(in_heap || in_native, "why else?"); + __ ldr(r0, src); + break; + } + case T_BOOLEAN: __ load_unsigned_byte (r0, src); break; + case T_BYTE: __ load_signed_byte (r0, src); break; + case T_CHAR: __ load_unsigned_short(r0, src); break; + case T_SHORT: __ load_signed_short (r0, src); break; + case T_DOUBLE: + if (hasFPU()) { + if (!src.is_safe_for(atomic ? 
Address::IDT_ATOMIC : Address::IDT_DOUBLE)) { + assert(tmp1 != noreg, "must be"); + __ lea(tmp1, src); + src = Address(tmp1); + } + if (atomic) { + __ atomic_ldrd(r0, r1, src.base()); + __ vmov_f64(d0, r0, r1); + } else { + __ vldr_f64(d0, src); + } + break; + } + // else fall-through + case T_LONG: + if (atomic) { + if (!src.is_safe_for(Address::IDT_ATOMIC)) { + assert(tmp1 != noreg, "must be"); + __ lea(tmp1, src); + src = Address(tmp1); + } + __ atomic_ldrd(r0, r1, src.base()); + } else { + __ ldrd(r0, r1, src); + } + break; + case T_FLOAT: + if (hasFPU()) { + if (!src.is_safe_for(Address::IDT_FLOAT)) { + assert(tmp1 != noreg, "must be"); + __ lea(tmp1, src); + src = Address(tmp1); + } + __ vldr_f32(f0, src); + break; + } + // else fall-through + case T_ADDRESS: + // fall-through + case T_INT: __ ldr (r0, src); break; + default: Unimplemented(); + } +} + +void BarrierSetAssembler::store_tos_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register tmp1, Register tmp2) { + bool in_heap = (decorators & IN_HEAP) != 0; + bool in_native = (decorators & IN_NATIVE) != 0; + bool atomic = (decorators & MO_SEQ_CST) != 0; + switch (type) { + case T_OBJECT: + case T_ARRAY: { + assert(in_heap || in_native, "why else?"); + __ str(r0, dst); + break; + } + case T_BOOLEAN: + __ andr(r0, r0, 0x1); // boolean is true if LSB is 1 + __ strb(r0, dst); + break; + case T_BYTE: __ strb (r0, dst); break; + case T_CHAR: __ strh (r0, dst); break; + case T_SHORT: __ strh (r0, dst); break; + case T_FLOAT: + if (hasFPU()) { + if (!dst.is_safe_for(Address::IDT_FLOAT)) { + assert(tmp1 != noreg, "must be"); + __ lea(tmp1, dst); + dst = Address(tmp1); + } + __ vstr_f32(d0, dst); + break; + } + // else fall-through + case T_INT: __ str (r0, dst); break; + case T_DOUBLE: + if (hasFPU()) { + if (atomic) { + __ vmov_f64(r0, r1, d0); + // fall-through to T_LONG + } else { + if (!dst.is_safe_for(Address::IDT_DOUBLE)) { + assert(tmp1 != noreg, "must be"); + __ lea(tmp1, dst); + dst = Address(tmp1); + } + __ vstr_f64(d0, dst); + break; + } + } + // else fall-through + case T_LONG: + if (atomic) { + assert(tmp1 != noreg && tmp2 != noreg, "must be"); + assert_different_registers(rscratch1, tmp1, tmp2); + Register base; + if (!dst.is_safe_for(Address::IDT_ATOMIC) || + dst.uses(tmp1) || dst.uses(tmp2)) { + __ lea(rscratch1, dst); + base = rscratch1; + } else { + base = dst.base(); // strexd only supports [base] addressing + } + __ atomic_strd(r0, r1, base, tmp1, tmp2); + } else { + __ strd(r0, r1, dst); + } + break; + case T_ADDRESS: __ str (r0, dst); break; + default: Unimplemented(); + } +} + +void BarrierSetAssembler::obj_equals(MacroAssembler* masm, + Register obj1, Register obj2) { + __ cmp(obj1, obj2); +} + +void BarrierSetAssembler::try_resolve_jobject_in_native(MacroAssembler* masm, Register jni_env, + Register obj, Register tmp, Label& slowpath) { + // If mask changes we need to ensure that the inverse is still encodable as an immediate + STATIC_ASSERT(JNIHandles::weak_tag_mask == 1); + __ bic(obj, obj, JNIHandles::weak_tag_mask); + + __ ldr(obj, Address(obj, 0)); // *obj +} + +// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
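+// The fast path is a plain bump-the-pointer allocation against the thread-local
+// allocation buffer; roughly (sketch only, with pseudo accessors -- the code
+// below works on registers and thread-local offsets):
+//   obj = thread->tlab_top();
+//   end = obj + size;
+//   if (end > thread->tlab_end()) goto slow_case;
+//   thread->set_tlab_top(end);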
+void BarrierSetAssembler::tlab_allocate(MacroAssembler* masm, Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Register t2, + Label& slow_case) { + assert_different_registers(obj, t2); + assert_different_registers(obj, var_size_in_bytes); + Register end = t2; + + // verify_tlab(); + + __ ldr(obj, Address(rthread, JavaThread::tlab_top_offset())); + if (var_size_in_bytes == noreg) { + __ lea(end, Address(obj, con_size_in_bytes)); + } else { + __ lea(end, Address(obj, var_size_in_bytes)); + } + __ ldr(rscratch1, Address(rthread, JavaThread::tlab_end_offset())); + __ cmp(end, rscratch1); + __ b(slow_case, Assembler::HI); + + // update the tlab top pointer + __ str(end, Address(rthread, JavaThread::tlab_top_offset())); + + // recover var_size_in_bytes if necessary + if (var_size_in_bytes == end) { + __ sub(var_size_in_bytes, var_size_in_bytes, obj); + } + // verify_tlab(); +} + +// Defines obj, preserves var_size_in_bytes. uses rscratch1 and rscratch2 +void BarrierSetAssembler::eden_allocate(MacroAssembler* masm, Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Label& slow_case) { + assert_different_registers(obj, var_size_in_bytes, t1); + if (!Universe::heap()->supports_inline_contig_alloc()) { + __ b(slow_case); + } else { + Register end = t1; + Register heap_end = rscratch2; + Label retry; + __ bind(retry); + + __ mov(rscratch1, ExternalAddress((address) Universe::heap()->end_addr())); + __ ldr(heap_end, Address(rscratch1)); + + ExternalAddress heap_top((address) Universe::heap()->top_addr()); + __ mov(rscratch1, heap_top); + __ ldrex(obj, rscratch1); + + // Adjust it my the size of our new object + if (var_size_in_bytes == noreg) { + __ lea(end, Address(obj, con_size_in_bytes)); + } else { + __ lea(end, Address(obj, var_size_in_bytes)); + } + + // if end < obj then we wrapped around high memory + __ cmp(end, obj); + __ b(slow_case, Assembler::LO); + + __ cmp(end, heap_end); + __ b(slow_case, Assembler::HI); + + // If heap_top hasn't been changed by some other thread, update it. + __ mov(rscratch2, rscratch1); + __ strex(rscratch1, end, rscratch2); + __ cmp(rscratch1, 0); + __ b(retry, Assembler::NE); + + incr_allocated_bytes(masm, var_size_in_bytes, con_size_in_bytes, t1); + } +} + +void BarrierSetAssembler::incr_allocated_bytes(MacroAssembler* masm, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1) { + assert(t1->is_valid(), "need temp reg"); + + __ ldr(t1, Address(rthread, in_bytes(JavaThread::allocated_bytes_offset()))); + if (var_size_in_bytes->is_valid()) { + __ add(t1, t1, var_size_in_bytes); + } else { + __ add(t1, t1, con_size_in_bytes); + } + __ str(t1, Address(rthread, in_bytes(JavaThread::allocated_bytes_offset()))); +} --- /dev/null 2018-09-25 19:24:51.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/shared/barrierSetAssembler_aarch32.hpp 2018-09-25 19:24:51.000000000 +0300 @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_GC_SHARED_BARRIERSETASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_GC_SHARED_BARRIERSETASSEMBLER_AARCH32_HPP + +#include "asm/macroAssembler.hpp" +#include "memory/allocation.hpp" +#include "oops/access.hpp" + +class BarrierSetAssembler: public CHeapObj { +private: + void incr_allocated_bytes(MacroAssembler* masm, + Register var_size_in_bytes, int con_size_in_bytes, + Register t1 = noreg); + +public: + virtual void arraycopy_prologue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, + Register addr, Register count) {} + virtual void arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, + Register start, Register end, Register tmp) {} + virtual void load_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Register dst, Address src, Register tmp1, Register tmp_thread); + virtual void store_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2); + virtual void load_tos_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address src, Register tmp1, Register tmp_thread); + virtual void store_tos_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register tmp1, Register tmp2); + + virtual void obj_equals(MacroAssembler* masm, + Register obj1, Register obj2); + + virtual void try_resolve_jobject_in_native(MacroAssembler* masm, Register jni_env, + Register obj, Register tmp, Label& slowpath); + + virtual void tlab_allocate(MacroAssembler* masm, + Register obj, // result: pointer to object after successful allocation + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Register t2, // temp register + Label& slow_case // continuation point if fast allocation fails + ); + + void eden_allocate(MacroAssembler* masm, + Register obj, // result: pointer to object after successful allocation + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Label& slow_case // continuation point if fast allocation fails + ); + virtual void barrier_stubs_init() {} +}; + +#endif // CPU_AARCH32_GC_SHARED_BARRIERSETASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:52.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/shared/cardTableBarrierSetAssembler_aarch32.cpp 2018-09-25 19:24:52.000000000 +0300 @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "gc/shared/barrierSet.hpp" +#include "gc/shared/cardTable.hpp" +#include "gc/shared/cardTableBarrierSet.hpp" +#include "gc/shared/cardTableBarrierSetAssembler.hpp" +#include "interpreter/interp_masm.hpp" + +#define __ masm-> + +void CardTableBarrierSetAssembler::store_check(MacroAssembler* masm, Register obj, Address dst) { + + // Does a store check for the oop in register obj. The content of + // register obj is destroyed afterwards. + + BarrierSet* bs = BarrierSet::barrier_set(); + assert(bs->kind() == BarrierSet::CardTableBarrierSet, + "Wrong barrier set kind"); + + CardTableBarrierSet* ctbs = barrier_set_cast(bs); + CardTable* ct = ctbs->card_table(); + assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); + + assert(CardTable::dirty_card_val() == 0, "must be"); + + jbyte *byte_map_base = + ((CardTableBarrierSet*)(BarrierSet::barrier_set()))->card_table()->byte_map_base(); + __ mov(rscratch1, (uint32_t)byte_map_base); + assert((p2i(byte_map_base) & 0xff) == 0, "fix store char 0 below"); + + if (UseCondCardMark) { + __ membar(Assembler::StoreLoad); + __ ldrb(rscratch2, Address(rscratch1, obj, lsr((int) CardTable::card_shift))); + __ cmp(rscratch2, 0); + __ strb(rscratch1, Address(rscratch1, obj, lsr((int) CardTable::card_shift)), Assembler::NE); + } else { + if (ct->scanned_concurrently()) { + __ membar(Assembler::StoreStore); + } + __ strb(rscratch1, Address(rscratch1, obj, lsr((int) CardTable::card_shift))); + } +} + +void CardTableBarrierSetAssembler::gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register start, Register end, Register scratch) { + BarrierSet* bs = BarrierSet::barrier_set(); + CardTableBarrierSet* ctbs = barrier_set_cast(bs); + CardTable* ct = ctbs->card_table(); + assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); + + Label L_loop; + + __ lsr(start, start, CardTable::card_shift); + __ lsr(end, end, CardTable::card_shift); + __ sub(end, end, start); // number of bytes to copy + + const Register count = end; // 'end' register contains bytes count now + __ mov(scratch, (address)ct->byte_map_base()); + __ add(start, start, scratch); + if (ct->scanned_concurrently()) { + __ membar(__ StoreStore); + } + __ bind(L_loop); + __ mov(scratch, 0); + __ strb(scratch, Address(start, count)); + __ subs(count, count, 1); + __ b(L_loop, Assembler::HS); +} + +void CardTableBarrierSetAssembler::oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, 
Register tmp2) { + bool in_heap = (decorators & IN_HEAP) != 0; + bool is_array = (decorators & IS_ARRAY) != 0; + bool on_anonymous = (decorators & ON_UNKNOWN_OOP_REF) != 0; + bool precise = is_array || on_anonymous; + + bool needs_post_barrier = val != noreg && in_heap; + BarrierSetAssembler::store_word_at(masm, decorators, type, dst, val, tmp1, noreg); + if (needs_post_barrier) { + // flatten object address if needed + if (!precise || (dst.index() == noreg && dst.offset() == 0)) { + store_check(masm, dst.base(), dst); + } else { + __ lea(tmp1, dst); + store_check(masm, tmp1, dst); + } + } +} --- /dev/null 2018-09-25 19:24:54.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/shared/cardTableBarrierSetAssembler_aarch32.hpp 2018-09-25 19:24:53.000000000 +0300 @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_GC_SHARED_CARDTABLEBARRIERSETASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_GC_SHARED_CARDTABLEBARRIERSETASSEMBLER_AARCH32_HPP + +#include "asm/macroAssembler.hpp" +#include "gc/shared/modRefBarrierSetAssembler.hpp" + +class CardTableBarrierSetAssembler: public ModRefBarrierSetAssembler { +protected: + void store_check(MacroAssembler* masm, Register obj, Address dst); + + virtual void gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register start, Register end, Register tmp); + virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2); + +}; + +#endif // #ifndef CPU_AARCH32_GC_SHARED_CARDTABLEBARRIERSETASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:24:55.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/shared/modRefBarrierSetAssembler_aarch32.cpp 2018-09-25 19:24:54.000000000 +0300 @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "gc/shared/modRefBarrierSetAssembler.hpp" + +#define __ masm-> + +void ModRefBarrierSetAssembler::arraycopy_prologue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, + Register addr, Register count) { + + if (is_oop) { + gen_write_ref_array_pre_barrier(masm, decorators, addr, count); + } +} + +void ModRefBarrierSetAssembler::arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, + Register start, Register end, Register scratch) { + if (is_oop) { + gen_write_ref_array_post_barrier(masm, decorators, start, end, scratch); + } +} + +void ModRefBarrierSetAssembler::store_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2) { + if (type == T_OBJECT || type == T_ARRAY) { + oop_store_at(masm, decorators, type, dst, val, tmp1, tmp2); + } else { + BarrierSetAssembler::store_word_at(masm, decorators, type, dst, val, tmp1, tmp2); + } +} --- /dev/null 2018-09-25 19:24:56.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/gc/shared/modRefBarrierSetAssembler_aarch32.hpp 2018-09-25 19:24:55.000000000 +0300 @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH64_GC_SHARED_MODREFBARRIERSETASSEMBLER_AARCH64_HPP +#define CPU_AARCH64_GC_SHARED_MODREFBARRIERSETASSEMBLER_AARCH64_HPP + +#include "asm/macroAssembler.hpp" +#include "gc/shared/barrierSetAssembler.hpp" + +// The ModRefBarrierSetAssembler filters away accesses on BasicTypes other +// than T_OBJECT/T_ARRAY (oops). The oop accesses call one of the protected +// accesses, which are overridden in the concrete BarrierSetAssembler. 
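+// In effect, store_word_at() in the matching .cpp boils down to:
+//   if (type == T_OBJECT || type == T_ARRAY) oop_store_at(...);   // barrier-specific
+//   else BarrierSetAssembler::store_word_at(...);                 // plain store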
+ +class ModRefBarrierSetAssembler: public BarrierSetAssembler { +protected: + // Generate code for an array write pre barrier + // + // addr - starting address + // count - element count + // + // Destroy no registers! + // + virtual void gen_write_ref_array_pre_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register addr, Register count) {} + + // Generate code for an array write post barrier + // + // Input: + // start - register containing starting address of destination array + // end - register containing ending address of destination array + // scratch - scratch register + // + // The input registers are overwritten. + // The ending address is inclusive. + virtual void gen_write_ref_array_post_barrier(MacroAssembler* masm, DecoratorSet decorators, + Register start, Register end, Register scratch) {} + + virtual void oop_store_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2) = 0; + +public: + virtual void arraycopy_prologue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, + Register addr, Register count); + virtual void arraycopy_epilogue(MacroAssembler* masm, DecoratorSet decorators, bool is_oop, + Register start, Register end, Register scratch); + virtual void store_word_at(MacroAssembler* masm, DecoratorSet decorators, BasicType type, + Address dst, Register val, Register tmp1, Register tmp2); +}; + +#endif // CPU_AARCH64_GC_SHARED_MODREFBARRIERSETASSEMBLER_AARCH64_HPP --- /dev/null 2018-09-25 19:24:57.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/globalDefinitions_aarch32.hpp 2018-09-25 19:24:56.000000000 +0300 @@ -0,0 +1,72 @@ +/* + * Copyright (c) 1999, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_GLOBALDEFINITIONS_AARCH32_HPP +#define CPU_AARCH32_VM_GLOBALDEFINITIONS_AARCH32_HPP + +// __ARM_PCS_VFP indicates that gcc runs with "-mfloat-abi=hard" option. +// This option allows generation of floating point instructions and enforces +// usage of FPU-specific calling conventions. 
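+// When gcc targets a soft-float ABI (-mfloat-abi=soft or softfp) this macro is
+// not predefined, HARD_FLOAT_CC stays undefined, and floating point arguments
+// follow the base AAPCS, i.e. they are passed in core registers.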
+#ifdef __ARM_PCS_VFP +#define HARD_FLOAT_CC +#endif // __ARM_PCS_VFP + +// If changing this please be sure to review all code which saves the registers +// and the corresponding register maps to ensure that the respective frame +// sizes are multiple of this new value +const int StackAlignmentInBytes = 8; + +// Indicates whether the C calling conventions require that +// 32-bit integer argument values are properly extended to 64 bits. +const bool CCallingConventionRequiresIntsAsLongs = false; + +#define SUPPORTS_NATIVE_CX8 + +// The maximum B/BL offset range on AArch32 is 32MB. +#undef CODE_CACHE_DEFAULT_LIMIT +#define CODE_CACHE_DEFAULT_LIMIT (32*M) + +// According to the ARMv8 ARM, "Concurrent modification and execution +// of instructions can lead to the resulting instruction performing +// any behavior that can be achieved by executing any sequence of +// instructions that can be executed from the same Exception level, +// except where the instruction before modification and the +// instruction after modification is a B, BL, NOP, BKPT, SVC, HVC, or +// SMC instruction." +// +// This makes the games we play when patching difficult, so when we +// come across an access that needs patching we deoptimize. There are +// ways we can avoid this, but these would slow down C1-compiled code +// in the default case. We could revisit this decision if we get any +// evidence that it's worth doing. +#define DEOPTIMIZE_WHEN_PATCHING + +#define SUPPORT_RESERVED_STACK_AREA + +#define THREAD_LOCAL_POLL + +#endif // CPU_AARCH32_VM_GLOBALDEFINITIONS_AARCH32_HPP --- /dev/null 2018-09-25 19:24:58.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/globals_aarch32.hpp 2018-09-25 19:24:57.000000000 +0300 @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2000, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_GLOBALS_AARCH32_HPP +#define CPU_AARCH32_VM_GLOBALS_AARCH32_HPP + +#include "utilities/globalDefinitions.hpp" +#include "utilities/macros.hpp" + +// Sets the default values for platform dependent flags used by the runtime system. 
+// (see globals.hpp) + +define_pd_global(bool, ShareVtableStubs, true); +define_pd_global(bool, NeedsDeoptSuspend, false); // only register window machines need this + +define_pd_global(bool, ImplicitNullChecks, true); // Generate code for implicit null checks +define_pd_global(bool, TrapBasedNullChecks, false); +define_pd_global(bool, UncommonNullCast, true); // Uncommon-trap NULLs past to check cast + +//TODO: update if 32 bit platforms need different sizes +define_pd_global(uintx, CodeCacheSegmentSize, 64 TIERED_ONLY(+64)); // Tiered compilation has large code-entry alignment. +define_pd_global(intx, CodeEntryAlignment, 32); +define_pd_global(intx, OptoLoopAlignment, 16); +define_pd_global(intx, InlineFrequencyCount, 100); + +#define DEFAULT_STACK_YELLOW_PAGES (2) +#define DEFAULT_STACK_RED_PAGES (1) +#define DEFAULT_STACK_SHADOW_PAGES (5 DEBUG_ONLY(+1)) +#define DEFAULT_STACK_RESERVED_PAGES (1) + +#define MIN_STACK_YELLOW_PAGES DEFAULT_STACK_YELLOW_PAGES +#define MIN_STACK_RED_PAGES DEFAULT_STACK_RED_PAGES +#define MIN_STACK_SHADOW_PAGES DEFAULT_STACK_SHADOW_PAGES +#define MIN_STACK_RESERVED_PAGES (0) + +define_pd_global(intx, StackYellowPages, DEFAULT_STACK_YELLOW_PAGES); +define_pd_global(intx, StackRedPages, DEFAULT_STACK_RED_PAGES); +define_pd_global(intx, StackShadowPages, DEFAULT_STACK_SHADOW_PAGES); +define_pd_global(intx, StackReservedPages, DEFAULT_STACK_RESERVED_PAGES); + +define_pd_global(bool, RewriteBytecodes, true); +define_pd_global(bool, RewriteFrequentPairs, true); + +define_pd_global(bool, PreserveFramePointer, false); + +// GC Ergo Flags +define_pd_global(uintx, CMSYoungGenPerWorker, 64*M); // default max size of CMS young gen, per GC worker thread + +define_pd_global(uintx, TypeProfileLevel, 111); + +define_pd_global(bool, CompactStrings, true); +define_pd_global(intx, InitArrayShortSize, BytesPerLong); + +define_pd_global(bool, ThreadLocalHandshakes, false/*true*/); + +// FIXME this turned out to be needed for the core build too? +//#if defined(COMPILER1) || defined(COMPILER2) +define_pd_global(intx, InlineSmallCode, 1000); +//#endif + +// Define it instead providing as option, inlining the constant significantly +// improves perfromance. The option is disabled for AARCH32 in globals.hpp too. +#define UseMembar true + +#define ARCH_FLAGS(develop, \ + product, \ + diagnostic, \ + experimental, \ + notproduct, \ + range, \ + constraint, \ + writeable) \ + \ + product(bool, NearCpool, true, \ + "constant pool is close to instructions") \ + \ + product(bool, UseBarriersForVolatile, false, \ + "Use memory barriers to implement volatile accesses") \ + product(bool, TraceTraps, false, "Trace all traps the signal handler") \ + product(bool, UseSIMDForMemoryOps, false, \ + "Use SIMD instructions in generated memory move code") \ + product(bool, UseNeon, false, \ + "Use Neon for CRC32 computation") \ + product(bool, UseCRC32, false, \ + "Use CRC32 instructions for CRC32 computation") \ + product(bool, JNIFrameAPCS, false, "Assume APCS frame layout for JNI") \ + product(bool, FrameAPCS, false, "Use APCS frame layout") \ + product(bool, VMFrameAPCS, false, "Force APCS frame layout for VM. " \ + "Usually you don't need to set his flag, VM layout is autodetected") \ + product(bool, UseFPU, true, "Enable FPU utilization at floating point ops." 
\ + "Affects SoftFP mode only.") + +#endif // CPU_AARCH32_VM_GLOBALS_AARCH32_HPP --- /dev/null 2018-09-25 19:24:59.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/icBuffer_aarch32.cpp 2018-09-25 19:24:59.000000000 +0300 @@ -0,0 +1,78 @@ +/* + * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "code/icBuffer.hpp" +#include "gc/shared/collectedHeap.inline.hpp" +#include "interpreter/bytecodes.hpp" +#include "memory/resourceArea.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/oop.inline.hpp" + +int InlineCacheBuffer::ic_stub_code_size() { + return /* ldr */ NativeInstruction::arm_insn_sz + + /* far_branch */ MacroAssembler::far_branch_size() + + /* emit_int32 */ NativeInstruction::arm_insn_sz; +} + +#define __ masm-> + +void InlineCacheBuffer::assemble_ic_buffer_code(address code_begin, void* cached_value, address entry_point) { + ResourceMark rm; + CodeBuffer code(code_begin, ic_stub_code_size()); + MacroAssembler* masm = new MacroAssembler(&code); + // note: even though the code contains an embedded value, we do not need reloc info + // because + // (1) the value is old (i.e., doesn't matter for scavenges) + // (2) these ICStubs are removed *before* a GC happens, so the roots disappear + // assert(cached_value == NULL || cached_oop->is_perm(), "must be perm oop"); + + address start = __ pc(); + Label l; + __ ldr(rscratch2, l); + __ far_jump(ExternalAddress(entry_point)); + __ bind(l); + __ emit_int32((int32_t)cached_value); + // Only need to invalidate the 1st two instructions - not the whole ic stub + ICache::invalidate_range(code_begin, InlineCacheBuffer::ic_stub_code_size()); + assert(__ pc() - start == ic_stub_code_size(), "must be"); +} + +address InlineCacheBuffer::ic_buffer_entry_point(address code_begin) { + NativeMovConstReg* move = nativeMovConstReg_at(code_begin); // creation also verifies the object + NativeJump* jump = nativeJump_at(code_begin + 4); + return jump->jump_destination(); +} + + +void* InlineCacheBuffer::ic_buffer_cached_value(address code_begin) { + // The word containing the cached value is at the end of this IC buffer + uintptr_t *p = (uintptr_t *)(code_begin + ic_stub_code_size() - wordSize); + void* o = (void*)*p; + return o; +} --- /dev/null 2018-09-25 19:25:00.000000000 +0300 +++ 
new/src/hotspot/cpu/aarch32/icache_aarch32.cpp 2018-09-25 19:25:00.000000000 +0300 @@ -0,0 +1,38 @@ +/* + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "runtime/icache.hpp" + +void ICacheStubGenerator::generate_icache_flush( + ICache::flush_icache_stub_t* flush_icache_stub) { + // Give anyone who calls this a surprise + *flush_icache_stub = (ICache::flush_icache_stub_t)NULL; +} + +void ICache::initialize() { +} --- /dev/null 2018-09-25 19:25:01.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/icache_aarch32.hpp 2018-09-25 19:25:01.000000000 +0300 @@ -0,0 +1,45 @@ +/* + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_ICACHE_AARCH32_HPP +#define CPU_AARCH32_VM_ICACHE_AARCH32_HPP + +// Interface for updating the instruction cache. Whenever the VM +// modifies code, part of the processor instruction cache potentially +// has to be flushed. 
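+// On AArch32 no flush stub is generated (icache_aarch32.cpp installs a NULL
+// stub); instead the helpers below hand the affected range straight to the gcc
+// builtin __clear_cache.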
+ +class ICache : public AbstractICache { + public: + static void initialize(); + static void invalidate_word(address addr) { + __clear_cache((char *)addr, (char *)(addr + 3)); + } + static void invalidate_range(address start, int nbytes) { + __clear_cache((char *)start, (char *)(start + nbytes)); + } +}; + +#endif // CPU_AARCH32_VM_ICACHE_AARCH32_HPP --- /dev/null 2018-09-25 19:25:02.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/interp_masm_aarch32.cpp 2018-09-25 19:25:02.000000000 +0300 @@ -0,0 +1,1941 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "gc/shared/barrierSet.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "interp_masm_aarch32.hpp" +#include "interpreter/interpreter.hpp" +#include "interpreter/interpreterRuntime.hpp" +#include "logging/log.hpp" +#include "oops/arrayOop.hpp" +#include "oops/markOop.hpp" +#include "oops/method.hpp" +#include "oops/methodData.hpp" +#include "prims/jvmtiExport.hpp" +#include "prims/jvmtiThreadState.hpp" +#include "runtime/basicLock.hpp" +#include "runtime/biasedLocking.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/safepointMechanism.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/thread.inline.hpp" + +#include "vm_version_aarch32.hpp" +#include "register_aarch32.hpp" + + +// Implementation of InterpreterMacroAssembler + +void InterpreterMacroAssembler::narrow(Register result) { + // Get method->_constMethod->_result_type + ldr(rscratch1, Address(rfp, frame::get_interpreter_frame_method_offset() * wordSize)); + ldr(rscratch1, Address(rscratch1, Method::const_offset())); + ldrb(rscratch1, Address(rscratch1, ConstMethod::result_type_offset())); + + Label done; + + // common case first + + cmp(rscratch1, T_INT); + b(done, Assembler::EQ); + + // mask integer result to narrower return type. 
+ cmp(rscratch1, T_BOOLEAN); + andr(result, result, 0x1, Assembler::EQ); + + cmp(rscratch1, T_BYTE); + sxtb(result, result, Assembler::ror(), Assembler::EQ); + + cmp(rscratch1, T_CHAR); + uxth(result, result, Assembler::ror(), Assembler::EQ); // truncate upper 16 bits + + sxth(result, result, Assembler::ror(), Assembler::NE); // sign-extend short + + // Nothing to do for T_INT + bind(done); +} + +void InterpreterMacroAssembler::jump_to_entry(address entry) { + assert(entry, "Entry must have been generated by now"); + b(entry); +} + +void InterpreterMacroAssembler::check_and_handle_popframe(Register java_thread) { + if (JvmtiExport::can_pop_frame()) { + Label L; + // Initiate popframe handling only if it is not already being + // processed. If the flag has the popframe_processing bit set, it + // means that this code is called *during* popframe handling - we + // don't want to reenter. + // This method is only called just after the call into the vm in + // call_VM_base, so the arg registers are available. + ldr(rscratch1, Address(rthread, JavaThread::popframe_condition_offset())); + tst(rscratch1, JavaThread::popframe_pending_bit); + b(L, Assembler::EQ); + tst(rscratch1, JavaThread::popframe_processing_bit); + b(L, Assembler::NE); + // Call Interpreter::remove_activation_preserving_args_entry() to get the + // address of the same-named entrypoint in the generated interpreter code. + call_VM_leaf(CAST_FROM_FN_PTR(address, Interpreter::remove_activation_preserving_args_entry)); + b(r0); + bind(L); + } +} + + +void InterpreterMacroAssembler::load_earlyret_value(TosState state) { + ldr(r2, Address(rthread, JavaThread::jvmti_thread_state_offset())); + const Address tos_addr(r2, JvmtiThreadState::earlyret_tos_offset()); + const Address oop_addr(r2, JvmtiThreadState::earlyret_oop_offset()); + const Address val_addr(r2, JvmtiThreadState::earlyret_value_offset()); + switch (state) { + case atos: ldr(r0, oop_addr); + mov(rscratch1, 0); + str(rscratch1, oop_addr); + verify_oop(r0, state); break; + case dtos: + if(hasFPU()) { + vldr_f64(d0, val_addr); break; + }//fall through otherwise + case ltos: ldrd(r0, val_addr); break; + case ftos: + if(hasFPU()) { + vldr_f32(d0, val_addr); break; + } //fall through otherwise + case btos: // fall through + case ztos: // fall through + case ctos: // fall through + case stos: // fall through + case itos: ldr(r0, val_addr); break; + case vtos: /* nothing to do */ break; + default : ShouldNotReachHere(); + } + // Clean up tos value in the thread object + mov(rscratch1, (int) ilgl); + str(rscratch1, tos_addr); + mov(rscratch1, 0); + str(rscratch1, val_addr); +} + + +void InterpreterMacroAssembler::check_and_handle_earlyret(Register java_thread) { + if (JvmtiExport::can_force_early_return()) { + Label L; + ldr(rscratch1, Address(rthread, JavaThread::jvmti_thread_state_offset())); + cbz(rscratch1, L); // if (thread->jvmti_thread_state() == NULL) exit; + + // Initiate earlyret handling only if it is not already being processed. + // If the flag has the earlyret_processing bit set, it means that this code + // is called *during* earlyret handling - we don't want to reenter. + ldr(rscratch1, Address(rscratch1, JvmtiThreadState::earlyret_state_offset())); + cmp(rscratch1, JvmtiThreadState::earlyret_pending); + b(L, Assembler::NE); + + // Call Interpreter::remove_activation_early_entry() to get the address of the + // same-named entrypoint in the generated interpreter code. 
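The narrowing sequence above keys off the return type recorded in ConstMethod; restated as plain C++ (the enum tags are stand-ins for the T_BOOLEAN/T_BYTE/T_CHAR/T_SHORT/T_INT constants), the rules it applies are:

#include <stdint.h>

enum ResultType { kBoolean, kByte, kChar, kShort, kInt };  // illustrative stand-ins

// Same narrowing rules as narrow(): the 32-bit result register is masked or
// sign-/zero-extended according to the callee's declared return type.
static int32_t narrow_result(int32_t result, ResultType t) {
  switch (t) {
    case kBoolean: return result & 1;                // andr(result, result, 0x1)
    case kByte:    return (int32_t)(int8_t)result;   // sxtb
    case kChar:    return (int32_t)(uint16_t)result; // uxth
    case kShort:   return (int32_t)(int16_t)result;  // sxth
    case kInt:     return result;                    // fast path: nothing to do
  }
  return result;
}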
+ ldr(rscratch1, Address(rthread, JavaThread::jvmti_thread_state_offset())); + ldr(rscratch1, Address(rscratch1, JvmtiThreadState::earlyret_tos_offset())); + call_VM_leaf(CAST_FROM_FN_PTR(address, Interpreter::remove_activation_early_entry), rscratch1); + b(r0); + bind(L); + } +} + +void InterpreterMacroAssembler::get_unsigned_2_byte_index_at_bcp( + Register reg, + int bcp_offset) { + assert(bcp_offset >= 0, "bcp is still pointing to start of bytecode"); + ldrh(reg, Address(rbcp, bcp_offset)); + rev16(reg, reg); +} + +void InterpreterMacroAssembler::get_dispatch() { + mov(rdispatch, ExternalAddress((address)Interpreter::dispatch_table())); +} + +void InterpreterMacroAssembler::get_cache_index_at_bcp(Register index, + int bcp_offset, + size_t index_size) { + assert(bcp_offset > 0, "bcp is still pointing to start of bytecode"); + if (index_size == sizeof(u2)) { + load_unsigned_short(index, Address(rbcp, bcp_offset)); + } else if (index_size == sizeof(u4)) { + // assert(EnableInvokeDynamic, "giant index used only for JSR 292"); + ldr(index, Address(rbcp, bcp_offset)); + // Check if the secondary index definition is still ~x, otherwise + // we have to change the following assembler code to calculate the + // plain index. + assert(ConstantPool::decode_invokedynamic_index(~123) == 123, "else change next line"); + inv(index, index); // convert to plain index + } else if (index_size == sizeof(u1)) { + load_unsigned_byte(index, Address(rbcp, bcp_offset)); + } else { + ShouldNotReachHere(); + } +} + +// Return +// Rindex: index into constant pool +// Rcache: address of cache entry - ConstantPoolCache::base_offset() +// +// A caller must add ConstantPoolCache::base_offset() to Rcache to get +// the true address of the cache entry. +// +void InterpreterMacroAssembler::get_cache_and_index_at_bcp(Register cache, + Register index, + int bcp_offset, + size_t index_size) { + assert_different_registers(cache, index); + assert_different_registers(cache, rcpool); + get_cache_index_at_bcp(index, bcp_offset, index_size); + assert(sizeof(ConstantPoolCacheEntry) == 4 * wordSize, "adjust code below"); + // convert from field index to ConstantPoolCacheEntry + // aarch32 already has the cache in rcpool so there is no need to + // install it in cache. instead we pre-add the indexed offset to + // rcpool and return it in cache. All clients of this method need to + // be modified accordingly. + add(cache, rcpool, index, lsl( exact_log2(4) + exact_log2(wordSize))); +} + + +void InterpreterMacroAssembler::get_cache_and_index_and_bytecode_at_bcp(Register cache, + Register index, + Register bytecode, + int byte_no, + int bcp_offset, + size_t index_size) { + get_cache_and_index_at_bcp(cache, index, bcp_offset, index_size); + // We use a 32-bit load here since the layout of 64-bit words on + // little-endian machines allow us that. + // n.b. 
unlike x86 cache already includes the index offset + ldr(bytecode, Address(cache, + ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::indices_offset())); + const int shift_count = (1 + byte_no) * BitsPerByte; + //ubfx(bytecode, bytecode, shift_count, BitsPerByte); + assert(shift_count >= 0 && shift_count <= 24 && 0 == (shift_count & 7), "Invalid shift count"); + uxtb(bytecode, bytecode, ror(shift_count)); +} + +void InterpreterMacroAssembler::get_cache_entry_pointer_at_bcp(Register cache, + Register tmp, + int bcp_offset, + size_t index_size) { + assert(cache != tmp, "must use different register"); + get_cache_index_at_bcp(tmp, bcp_offset, index_size); + assert(sizeof(ConstantPoolCacheEntry) == 4 * wordSize, "adjust code below"); + // convert from field index to ConstantPoolCacheEntry index + // and from word offset to byte offset + assert(exact_log2(in_bytes(ConstantPoolCacheEntry::size_in_bytes())) == 2 + LogBytesPerWord, "else change next line"); + ldr(cache, Address(rfp, frame::get_interpreter_frame_cache_offset() * wordSize)); + // skip past the header + add(cache, cache, in_bytes(ConstantPoolCache::base_offset())); + add(cache, cache, tmp, lsl(2 + LogBytesPerWord)); // construct pointer to cache entry +} + +void InterpreterMacroAssembler::get_method_counters(Register method, + Register mcs, Label& skip) { + Label has_counters; + ldr(mcs, Address(method, Method::method_counters_offset())); + cbnz(mcs, has_counters); + call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::build_method_counters), method); + ldr(mcs, Address(method, Method::method_counters_offset())); + cbz(mcs, skip); // No MethodCounters allocated, OutOfMemory + bind(has_counters); +} + +// Load object from cpool->resolved_references(index) +void InterpreterMacroAssembler::load_resolved_reference_at_index( + Register result, Register index, Register tmp) { + assert_different_registers(result, index); + // convert from field index to resolved_references() index and from + // word index to byte offset. Since this is a java object, it can be compressed + + get_constant_pool(result); + // load pointer for resolved_references[] objArray + ldr(result, Address(result, ConstantPool::cache_offset_in_bytes())); + ldr(result, Address(result, ConstantPoolCache::resolved_references_offset_in_bytes())); + resolve_oop_handle(result, tmp); + // Add in the index + add(result, result, index, lsl(LogBytesPerHeapOop)); + load_heap_oop(result, Address(result, arrayOopDesc::base_offset_in_bytes(T_OBJECT))); +} + +void InterpreterMacroAssembler::load_resolved_klass_at_offset( + Register cpool, Register index, Register klass, Register temp) { + add(temp, cpool, index, lsl(LogBytesPerWord)); + ldrh(temp, Address(temp, sizeof(ConstantPool))); // temp = resolved_klass_index + ldr(klass, Address(cpool, ConstantPool::resolved_klasses_offset_in_bytes())); // klass = cpool->_resolved_klasses + add(klass, klass, temp, lsl(LogBytesPerWord)); + ldr(klass, Address(klass, Array::base_offset_in_bytes())); +} + + +// Generate a subtype check: branch to ok_is_subtype if sub_klass is a +// subtype of super_klass. +// +// Args: +// r0: superklass +// Rsub_klass: subklass +// +// Kills: +// r2, r5 +void InterpreterMacroAssembler::gen_subtype_check(Register Rsub_klass, + Label& ok_is_subtype) { + assert(Rsub_klass != r0, "r0 holds superklass"); + assert(Rsub_klass != r2, "r2 holds 2ndary super array length"); + assert(Rsub_klass != r14, "r14 holds 2ndary super array scan ptr"); + + // Profile the not-null value's klass. 
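The uxtb-with-rotate sequence in get_cache_and_index_and_bytecode_at_bcp() above extracts one byte of the ConstantPoolCacheEntry indices word. A plain C++ restatement, assuming the usual layout of that word (constant-pool index in the low half-word, one resolved-bytecode value per byte above it):

#include <stdint.h>

// byte_no (1 or 2) selects which resolved-bytecode byte to read.
static uint32_t resolved_bytecode(uint32_t indices, int byte_no) {
  const int shift = (1 + byte_no) * 8;  // matches (1 + byte_no) * BitsPerByte
  return (indices >> shift) & 0xff;     // uxtb(bytecode, bytecode, ror(shift_count))
}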
+ profile_typecheck(r2, Rsub_klass, r14); // blows r2 + + // Do the check. + check_klass_subtype(Rsub_klass, r0, r2, ok_is_subtype); // blows r2 + + // Profile the failure of the check. + profile_typecheck_failed(r2); // blows r2 +} + +// Java Expression Stack + +void InterpreterMacroAssembler::pop_ptr(Register r) { + ldr(r, post(sp, wordSize)); +} + +void InterpreterMacroAssembler::pop_i(Register r) { + ldr(r, post(sp, wordSize)); +} + +void InterpreterMacroAssembler::pop_l(Register rLo, Register rHi) { + assert(rHi->encoding() == rLo->encoding() + 1, "must use two consecutive registers"); + ldrd(rLo, post(sp, 2 * Interpreter::stackElementSize)); +} + +void InterpreterMacroAssembler::push_ptr(Register r) { + str(r, pre(sp, -wordSize)); +} + +void InterpreterMacroAssembler::push_i(Register r) { + str(r, pre(sp, -wordSize)); +} + +void InterpreterMacroAssembler::push_l(Register rLo, Register rHi) { + assert(r2->encoding() == r1->encoding() + 1, "must use two consecutive registers"); + strd(rLo, pre(sp, -2 * wordSize)); +} + +void InterpreterMacroAssembler::pop_f(FloatRegister r) { + vldmia_f32(sp, FloatRegSet(r).bits()); +} + +void InterpreterMacroAssembler::pop_d(FloatRegister r) { + assert(is_even(r->encoding()), "not double!"); + vldmia_f64(sp, DoubleFloatRegSet(r).bits()); +} + +void InterpreterMacroAssembler::push_f(FloatRegister r) { + vstmdb_f32(sp, FloatRegSet(r).bits()); +} + +void InterpreterMacroAssembler::push_d(FloatRegister r) { + assert(is_even(r->encoding()), "not double!"); + vstmdb_f64(sp, DoubleFloatRegSet(r).bits()); +} + +void InterpreterMacroAssembler::pop(TosState state) { + switch (state) { + case atos: pop_ptr(); break; + case btos: + case ztos: + case ctos: + case stos: + case itos: pop_i(); break; + case ltos: pop_l(); break; + case ftos: + if(hasFPU()) { + pop_f(); + } else { + pop_i(); + } + break; + case dtos: + if(hasFPU()) { + pop_d(); + } else { + pop_l(); + } + break; + case vtos: /* nothing to do */ break; + default: ShouldNotReachHere(); + } + verify_oop(r0, state); +} + +void InterpreterMacroAssembler::push(TosState state) { + verify_oop(r0, state); + switch (state) { + case atos: push_ptr(); break; + case btos: + case ztos: + case ctos: + case stos: + case itos: push_i(); break; + case ltos: push_l(); break; + case ftos: + if(hasFPU()) { + push_f(); + } else { + push_i(); + } + break; + case dtos: + if(hasFPU()) { + push_d(); + } else { + push_l(); + } + break; + case vtos: /* nothing to do */ break; + default : ShouldNotReachHere(); + } +} + +// Helpers for swap and dup +void InterpreterMacroAssembler::load_ptr(int n, Register val) { + ldr(val, Address(sp, Interpreter::expr_offset_in_bytes(n))); +} + +void InterpreterMacroAssembler::store_ptr(int n, Register val) { + str(val, Address(sp, Interpreter::expr_offset_in_bytes(n))); +} + +// Load ftos/dtos from given address +void InterpreterMacroAssembler::load_float(Address src) { + if (hasFPU()) { + vldr_f32(f0, src); + } else { + ldr(r0, src); + } +} + +void InterpreterMacroAssembler::load_double(Address src) { + if (hasFPU()) { + vldr_f64(d0, src); + } else { + ldrd(r0, r1, src); + } +} + +void InterpreterMacroAssembler::prepare_to_jump_from_interpreted() { + // set sender sp + mov(r4, sp); + // record last_sp + str(sp, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); +} + +void print_method_name(Method* m, char * msg) { + if(MacroAssembler::enable_debug) { + printf("%s", msg); + fflush(stdout); + m->print_short_name(); + printf("\n"); + fflush(stdout); + } +} + +// Jump to 
from_interpreted entry of a call unless single stepping is possible +// in this thread in which case we must call the i2i entry +void InterpreterMacroAssembler::jump_from_interpreted(Register method, Register temp) { + prepare_to_jump_from_interpreted(); + + if (JvmtiExport::can_post_interpreter_events()) { + Label run_compiled_code; + // JVMTI events, such as single-stepping, are implemented partly by avoiding running + // compiled code in threads for which the event is enabled. Check here for + // interp_only_mode if these events CAN be enabled. + // interp_only is an int, on little endian it is sufficient to test the byte only + // Is a cmpl faster? + ldr(temp, Address(rthread, JavaThread::interp_only_mode_offset())); + cbz(temp, run_compiled_code); + ldr(temp, Address(method, Method::interpreter_entry_offset())); + b(temp); + bind(run_compiled_code); + } + + ldr(temp, Address(method, Method::from_interpreted_offset())); + b(temp); +} + +// The following two routines provide a hook so that an implementation +// can schedule the dispatch in two parts. amd64 does not do this. +void InterpreterMacroAssembler::dispatch_prolog(TosState state, int step) { +} + +void InterpreterMacroAssembler::dispatch_epilog(TosState state, int step) { + dispatch_next(state, step); +} + +void InterpreterMacroAssembler::dispatch_base(TosState state, + address* table, + bool verifyoop, + bool generate_poll) { + if (VerifyActivationFrameSize) { + Unimplemented(); + } + if (verifyoop) { + verify_oop(r0, state); + } + + /* Debugging code */ + bytecode_seen(rscratch1, r3); + + /*{ + Label skip; + + mov(r3, (address)&MacroAssembler::bytecodes_executed); + ldr(r2, r3); + add(r2, r2, 1); + str(r2, r3); + // Print out every 16384 (needs to be a power of two). + mov(r3, 16384 - 1); + tst(r2, r3); + b(skip, Assembler::NE); + reg_printf_important("Executed %d bytecodes.\n", r2); + bind(skip); + }*/ + + + /*mov(r3, (address)&MacroAssembler::bytecodes_until_print); + ldr(r2, Address(r3)); + cmp(r2, 0); + + sub(r2, r2, 1, Assembler::NE); + str(r2, Address(r3), Assembler::NE); + + mov(r2, 1, Assembler::EQ); + mov(r3, (address)&MacroAssembler::enable_debug, Assembler::EQ); + str(r2, Address(r3), Assembler::EQ); + + mov(r3, (address)&MacroAssembler::enable_method_debug, Assembler::EQ); + str(r2, Address(r3), Assembler::EQ);*/ + + /*Label end; + cmp(r2, 0); + b(end, Assembler::NE); + stop("got to end of bytecodes"); + bind(end);*/ + + get_bytecode(r14, rscratch1); + reg_printf("Dispatching bytecode %s (%d) @ BCP = %p\n", r14, rscratch1, rbcp); + /* End debugging code */ + + Label safepoint; + address* const safepoint_table = Interpreter::safept_table(state); + bool needs_thread_local_poll = generate_poll && + SafepointMechanism::uses_thread_local_poll() && table != safepoint_table; + + if (needs_thread_local_poll) { + NOT_PRODUCT(block_comment("Thread-local Safepoint poll")); + ldr(rscratch2, Address(rthread, Thread::polling_page_offset())); + tbnz(rscratch2, exact_log2(SafepointMechanism::poll_bit()), safepoint); + } + + if (table == Interpreter::dispatch_table(state)) { + add(rscratch2, rscratch1, Interpreter::distance_from_dispatch_table(state)); + ldr(r15_pc, Address(rdispatch, rscratch2, lsl(2))); + } else { + mov(rscratch2, (address)table); + ldr(r15_pc, Address(rscratch2, rscratch1, lsl(2))); + } + + if (needs_thread_local_poll) { + bind(safepoint); + lea(rscratch2, ExternalAddress((address)safepoint_table)); + ldr(r15_pc, Address(rscratch2, rscratch1, lsl(2))); + } +} + +void 
InterpreterMacroAssembler::dispatch_only(TosState state, bool generate_poll) { + dispatch_base(state, Interpreter::dispatch_table(state), true, generate_poll); +} + +void InterpreterMacroAssembler::dispatch_only_normal(TosState state) { + dispatch_base(state, Interpreter::normal_table(state)); +} + +void InterpreterMacroAssembler::dispatch_only_noverify(TosState state) { + dispatch_base(state, Interpreter::normal_table(state), false); +} + + +void InterpreterMacroAssembler::dispatch_next(TosState state, int step, bool generate_poll) { + // load next bytecode + ldrb(rscratch1, Address(pre(rbcp, step))); + dispatch_base(state, Interpreter::dispatch_table(state), generate_poll); +} + +void InterpreterMacroAssembler::dispatch_via(TosState state, address* table) { + // load current bytecode + ldrb(rscratch1, Address(rbcp, 0)); + dispatch_base(state, table); +} + +// remove activation +// +// Unlock the receiver if this is a synchronized method. +// Unlock any Java monitors from syncronized blocks. +// Remove the activation from the stack. +// +// If there are locked Java monitors +// If throw_monitor_exception +// throws IllegalMonitorStateException +// Else if install_monitor_exception +// installs IllegalMonitorStateException +// Else +// no error processing +void InterpreterMacroAssembler::remove_activation( + TosState state, + bool throw_monitor_exception, + bool install_monitor_exception, + bool notify_jvmdi) { + // Note: Registers r3 xmm0 may be in use for the + // result check if synchronized method + Label unlocked, unlock, no_unlock; + + // get the value of _do_not_unlock_if_synchronized into r3 + const Address do_not_unlock_if_synchronized(rthread, + in_bytes(JavaThread::do_not_unlock_if_synchronized_offset())); + ldrb(r3, do_not_unlock_if_synchronized); + mov(rscratch1, 0); + strb(rscratch1, do_not_unlock_if_synchronized); // reset the flag + + // get method access flags + ldr(rscratch1, Address(rfp, frame::get_interpreter_frame_method_offset() * wordSize)); + ldr(r2, Address(rscratch1, Method::access_flags_offset())); + tst(r2, JVM_ACC_SYNCHRONIZED); + b(unlocked, Assembler::EQ); + + // Don't unlock anything if the _do_not_unlock_if_synchronized flag + // is set. + cbnz(r3, no_unlock); + + // unlock monitor + push(state); // save result + + // BasicObjectLock will be first in list, since this is a + // synchronized method. However, need to check that the object has + // not been unlocked by an explicit monitorexit bytecode. + const Address monitor(rfp, frame::get_interpreter_frame_initial_sp_offset() * + wordSize - (int) sizeof(BasicObjectLock)); + // We use c_rarg1 so that if we go slow path it will be the correct + // register for unlock_object to pass to VM directly + lea(c_rarg1, monitor); // address of first monitor + + ldr(r0, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes())); + cbnz(r0, unlock); + + pop(state); + if (throw_monitor_exception) { + // Entry already unlocked, need to throw exception + call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_illegal_monitor_state_exception)); + should_not_reach_here(); + } else { + // Monitor already unlocked during a stack unroll. If requested, + // install an illegal_monitor_state_exception. Continue with + // stack unrolling. 
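The dispatch_base()/dispatch_next() pair above implement token-threaded dispatch: the next bytecode is fetched through rbcp and used to index a table of handler addresses, and loading the selected entry straight into r15_pc performs the jump. A simplified, standalone model of that control flow (the handler type and table here are illustrative only):

#include <stdint.h>

// In the real interpreter, rdispatch holds Interpreter::dispatch_table()
// and the loaded entry goes directly into the program counter.
typedef void (*BytecodeHandler)();

static void dispatch_next_model(const uint8_t*& bcp, int step,
                                BytecodeHandler table[256]) {
  bcp += step;                 // ldrb(rscratch1, Address(pre(rbcp, step)))
  uint8_t bytecode = *bcp;     // the dispatch token
  table[bytecode]();           // ldr(r15_pc, Address(rdispatch, rscratch2, lsl(2)))
}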
+ if (install_monitor_exception) { + call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::new_illegal_monitor_state_exception)); + } + b(unlocked); + } + + bind(unlock); + unlock_object(c_rarg1); + pop(state); + + // Check that for block-structured locking (i.e., that all locked + // objects has been unlocked) + bind(unlocked); + + // r0: Might contain return value + // FIXME r1 : Might contain the value too + + // Check that all monitors are unlocked + { + Label loop, exception, entry, restart; + const int entry_size = frame::interpreter_frame_monitor_size() * wordSize; + const Address monitor_block_top( + rfp, frame::get_interpreter_frame_monitor_block_top_offset() * wordSize); + const Address monitor_block_bot( + rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize); + + bind(restart); + // We can't use c_rarg1 as it might contain a result + ldr(c_rarg2, monitor_block_top); // points to current entry, starting + // with top-most entry + lea(r14, monitor_block_bot); // points to word before bottom of + // monitor block + b(entry); + + // Entry already locked, need to throw exception + bind(exception); + + if (throw_monitor_exception) { + // Throw exception + MacroAssembler::call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime:: + throw_illegal_monitor_state_exception)); + should_not_reach_here(); + } else { + // Stack unrolling. Unlock object and install illegal_monitor_exception. + // Unlock does not block, so don't have to worry about the frame. + // We don't have to preserve c_rarg1 since we are going to throw an exception. + + push(state); + mov(c_rarg1, c_rarg2); + unlock_object(c_rarg1); + pop(state); + + if (install_monitor_exception) { + call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime:: + new_illegal_monitor_state_exception)); + } + + b(restart); + } + + bind(loop); + // check if current entry is used + ldr(rscratch1, Address(c_rarg2, BasicObjectLock::obj_offset_in_bytes())); + cbnz(rscratch1, exception); + + add(c_rarg2, c_rarg2, entry_size); // otherwise advance to next entry + bind(entry); + cmp(c_rarg2, r14); // check if bottom reached + b(loop, Assembler::NE); // if not at bottom then check this entry + } + + bind(no_unlock); + + // jvmti support + if (notify_jvmdi) { + notify_method_exit(state, NotifyJVMTI); // preserve TOSCA + } else { + notify_method_exit(state, SkipNotifyJVMTI); // preserve TOSCA + } + + if (StackReservedPages > 0) { + // testing if reserved zone needs to be re-enabled + Label no_reserved_zone_enabling; + + ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); + cmp(sp, rscratch1); + b(no_reserved_zone_enabling, Assembler::LS); + + call_VM_leaf( + CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone), rthread); + call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_delayed_StackOverflowError)); + should_not_reach_here(); + + bind(no_reserved_zone_enabling); + } + + // remove activation + // get sender sp + ldr(rscratch1, + Address(rfp, frame::get_interpreter_frame_sender_sp_offset() * wordSize)); + // remove frame anchor + leave(); + // If we're returning to interpreted code we will shortly be + // adjusting SP to allow some space for ESP. If we're returning to + // compiled code the saved sender SP was saved in sender_sp, so this + // restores it. 
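The monitor-block walk in remove_activation() above can be pictured with a small model: the frame holds a contiguous run of BasicObjectLock slots between monitor_block_top and monitor_block_bot, and any slot whose object field is still set at method exit takes the exception/forced-unlock path. A sketch under that assumption (struct layout and names here are illustrative):

#include <stddef.h>

struct MonitorSlot {        // stands in for BasicObjectLock
  void* displaced_header;
  void* obj;                // cleared to NULL when the slot is unlocked/freed
};

// 'top' is the most recently pushed slot; 'bottom' is one past the oldest one.
static bool all_monitors_unlocked(const MonitorSlot* top, const MonitorSlot* bottom) {
  for (const MonitorSlot* cur = top; cur != bottom; ++cur) {
    if (cur->obj != NULL) {
      return false;         // corresponds to the branch to 'exception'
    }
  }
  return true;
}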
+ //bic(sp, rscratch1, 0xf); changed to not drop it as this is the sp + mov(sp, rscratch1); +} + +// Lock object +// +// Args: +// c_rarg1: BasicObjectLock to be used for locking +// +// Kills: +// r0 +// c_rarg0, c_rarg1, c_rarg2, c_rarg3, .. (param regs) +// rscratch1, rscratch2 (scratch regs) +void InterpreterMacroAssembler::lock_object(Register lock_reg) +{ + reg_printf("LOCK:\n"); + assert(lock_reg == c_rarg1, "The argument is only for looks. It must be c_rarg1"); + if (UseHeavyMonitors) { + call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), + lock_reg); + } else { + Label done; + + const Register swap_reg = r0; + const Register obj_reg = c_rarg3; // Will contain the oop + + const int obj_offset = BasicObjectLock::obj_offset_in_bytes(); + const int lock_offset = BasicObjectLock::lock_offset_in_bytes (); + const int mark_offset = lock_offset + + BasicLock::displaced_header_offset_in_bytes(); + + Label slow_case; + + // Load object pointer into obj_reg %c_rarg3 + ldr(obj_reg, Address(lock_reg, obj_offset)); + + if (UseBiasedLocking) { + biased_locking_enter(obj_reg, swap_reg, rscratch2, rscratch1, false, done, &slow_case); + } + + // Load (object->mark() | 1) into swap_reg + ldr(rscratch1, Address(obj_reg, oopDesc::mark_offset_in_bytes())); + orr(swap_reg, rscratch1, 1); + + // Save (object->mark() | 1) into BasicLock's displaced header + str(swap_reg, Address(lock_reg, mark_offset)); + + assert(lock_offset == 0, + "displached header must be first word in BasicObjectLock"); + + Label fail; + if (PrintBiasedLockingStatistics) { + Label fast; + cmpxchg_obj_header(swap_reg, lock_reg, obj_reg, rscratch1, fast, &fail); + bind(fast); + atomic_inc(Address((address)BiasedLocking::fast_path_entry_count_addr()), + rscratch2, rscratch1); + b(done); + bind(fail); + } else { + cmpxchg_obj_header(swap_reg, lock_reg, obj_reg, rscratch1, done, /*fallthrough*/NULL); + } + + // Test if the oopMark is an obvious stack pointer, i.e., + // 1) (mark & 7) == 0, and + // 2) rsp <= mark < mark + os::pagesize() + // + // These 3 tests can be done by evaluating the following + // expression: ((mark - rsp) & (7 - os::vm_page_size())), + // assuming both stack pointer and pagesize have their + // least significant 3 bits clear. + // NOTE: the oopMark is in swap_reg %r0 as the result of cmpxchg + // NOTE2: aarch32 does not like to subtract sp from rn so take a + // copy + + + //mov(rscratch1, sp); + //sub(swap_reg, swap_reg, rscratch1); + //ands(swap_reg, swap_reg, (unsigned long)(7 - os::vm_page_size())); + sub(swap_reg, swap_reg, sp); + mov(rscratch1, (os::vm_page_size() - 1) & ~0b11); + bics(swap_reg, swap_reg, rscratch1); + + // Save the test result, for recursive case, the result is zero + str(swap_reg, Address(lock_reg, mark_offset)); + + if (PrintBiasedLockingStatistics) { + b(slow_case, Assembler::NE); + atomic_inc(Address((address)BiasedLocking::fast_path_entry_count_addr()), + rscratch2, rscratch1); + } + b(done, Assembler::EQ); + + bind(slow_case); + + // Call the runtime routine for slow case + call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorenter), + lock_reg); + + bind(done); + } +} + + +// Unlocks an object. Used in monitorexit bytecode and +// remove_activation. Throws an IllegalMonitorException if object is +// not locked by current thread. +// +// Args: +// c_rarg1: BasicObjectLock for lock +// +// Kills: +// r0 +// c_rarg0, c_rarg1, c_rarg2, c_rarg3, ... 
(param regs) +// rscratch1, rscratch2 (scratch regs) +void InterpreterMacroAssembler::unlock_object(Register lock_reg) +{ + assert(lock_reg == c_rarg1, "The argument is only for looks. It must be rarg1"); + + reg_printf("UNLOCK:\n"); + if (UseHeavyMonitors) { + call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), + lock_reg); + } else { + Label done; + + //create_breakpoint(); + const Register swap_reg = c_rarg0; + const Register header_reg = c_rarg2; // Will contain the old oopMark + const Register obj_reg = c_rarg3; // Will contain the oop + + save_bcp(); // Save in case of exception + + // Convert from BasicObjectLock structure to object and BasicLock + // structure Store the BasicLock address into %r0 + lea(swap_reg, Address(lock_reg, BasicObjectLock::lock_offset_in_bytes())); + + // Load oop into obj_reg(%c_rarg3) + ldr(obj_reg, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes())); + + // Free entry + mov(rscratch2, 0); + str(rscratch2, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes())); + + if (UseBiasedLocking) { + biased_locking_exit(obj_reg, header_reg, done); + } + + // Load the old header from BasicLock structure + ldr(header_reg, Address(swap_reg, + BasicLock::displaced_header_offset_in_bytes())); + + // Test for recursion + cbz(header_reg, done); + + // Atomic swap back the old header + cmpxchg_obj_header(swap_reg, header_reg, obj_reg, rscratch1, done, /*fallthrough*/NULL); + + // Call the runtime routine for slow case. + str(obj_reg, Address(lock_reg, BasicObjectLock::obj_offset_in_bytes())); // restore obj + call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::monitorexit), + lock_reg); + + bind(done); + + restore_bcp(); + } +} + +void InterpreterMacroAssembler::test_method_data_pointer(Register mdp, + Label& zero_continue) { + assert(ProfileInterpreter, "must be profiling interpreter"); + ldr(mdp, Address(rfp, frame::get_interpreter_frame_mdp_offset() * wordSize)); + cbz(mdp, zero_continue); +} + +// Set the method data pointer for the current bcp. +void InterpreterMacroAssembler::set_method_data_pointer_for_bcp() { + assert(ProfileInterpreter, "must be profiling interpreter"); + Label set_mdp; + strd(r0, r1, Address(pre(sp, -2 * wordSize))); + + // Test MDO to avoid the call if it is NULL. + ldr(r0, Address(rmethod, in_bytes(Method::method_data_offset()))); + cbz(r0, set_mdp); + call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::bcp_to_di), rmethod, rbcp); + // r0: mdi + // mdo is guaranteed to be non-zero here, we checked for it before the call. + ldr(r1, Address(rmethod, in_bytes(Method::method_data_offset()))); + lea(r1, Address(r1, in_bytes(MethodData::data_offset()))); + add(r0, r1, r0); + str(r0, Address(rfp, frame::get_interpreter_frame_mdp_offset() * wordSize)); + bind(set_mdp); + ldrd(r0, r1, Address(post(sp, 2 * wordSize))); +} + +void InterpreterMacroAssembler::verify_method_data_pointer() { + assert(ProfileInterpreter, "must be profiling interpreter"); +#ifdef ASSERT + Label verify_continue; + strd(r0, r1, Address(pre(sp, -2 * wordSize))); + strd(r2, r3, Address(pre(sp, -2 * wordSize))); + test_method_data_pointer(r3, verify_continue); // If mdp is zero, continue + get_method(r1); + + // If the mdp is valid, it will point to a DataLayout header which is + // consistent with the bcp. The converse is highly probable also. 
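The fast path in lock_object() above decides whether a failed compare-and-swap is really a recursive stack lock by checking that the current mark word points into the locking thread's own stack page and has its tag bits clear. Restated in C++, assuming a 4 KiB page and two low mark-word tag bits, as the mask built from os::vm_page_size() and ~0b11 implies:

#include <stdint.h>
#include <stddef.h>

// Zero result <=> sp <= mark < sp + page_size and the two low (tag) bits of the
// difference are clear, i.e. the mark already points at a BasicLock on this
// thread's stack, so the lock is recursive and a zero displaced header is stored.
static uintptr_t recursive_stack_lock_test(uintptr_t mark, uintptr_t sp,
                                           size_t page_size /* assumed 4096 */) {
  uintptr_t in_page_mask = ((uintptr_t)page_size - 1) & ~(uintptr_t)0x3; // mov(rscratch1, ...)
  return (mark - sp) & ~in_page_mask;                                    // sub + bics
}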
+ ldrsh(r2, Address(r3, in_bytes(DataLayout::bci_offset()))); + ldr(rscratch1, Address(r1, Method::const_offset())); + add(r2, r2, rscratch1); + lea(r2, Address(r2, ConstMethod::codes_offset())); + cmp(r2, rbcp); + b(verify_continue, Assembler::EQ); + // r1: method + // rbcp: bcp // rbcp == 22 + // r3: mdp + call_VM_leaf(CAST_FROM_FN_PTR(address, InterpreterRuntime::verify_mdp), + r1, rbcp, r3); + bind(verify_continue); + ldrd(r2, r3, Address(post(sp, 2 * wordSize))); + ldrd(r0, r1, Address(post(sp, 2 * wordSize))); +#endif // ASSERT +} + + +void InterpreterMacroAssembler::set_mdp_data_at(Register mdp_in, + int constant, + Register value) { + assert(ProfileInterpreter, "must be profiling interpreter"); + Address data(mdp_in, constant); + str(value, data); +} + + +void InterpreterMacroAssembler::increment_mdp_data_at(Register mdp_in, + int constant, + bool decrement) { + increment_mdp_data_at(mdp_in, noreg, constant, decrement); +} + +void InterpreterMacroAssembler::increment_mdp_data_at(Register mdp_in, + Register reg, + int constant, + bool decrement) { + assert(ProfileInterpreter, "must be profiling interpreter"); + // %%% this does 64bit counters at best it is wasting space + // at worst it is a rare bug when counters overflow + + assert_different_registers(rscratch2, rscratch1, mdp_in, reg); + + Address addr1(mdp_in, constant); + Address addr2(rscratch2, reg, lsl(0)); + Address &addr = addr1; + if (reg != noreg) { + lea(rscratch2, addr1); + addr = addr2; + } + + if (decrement) { + // Decrement the register. Set condition codes. + // Intel does this + // addptr(data, (int32_t) -DataLayout::counter_increment); + // If the decrement causes the counter to overflow, stay negative + // Label L; + // jcc(Assembler::negative, L); + // addptr(data, (int32_t) DataLayout::counter_increment); + // so we do this + ldr(rscratch1, addr); + subs(rscratch1, rscratch1, (unsigned)DataLayout::counter_increment); + Label L; + b(L, Assembler::LO); // skip store if counter underflow + str(rscratch1, addr); + bind(L); + } else { + assert(DataLayout::counter_increment == 1, + "flow-free idiom only works with 1"); + // Intel does this + // Increment the register. Set carry flag. + // addptr(data, DataLayout::counter_increment); + // If the increment causes the counter to overflow, pull back by 1. 
+ // sbbptr(data, (int32_t)0); + // so we do this + ldr(rscratch1, addr); + adds(rscratch1, rscratch1, DataLayout::counter_increment); + Label L; + b(L, Assembler::CS); // skip store if counter overflow + str(rscratch1, addr); + bind(L); + } +} + +void InterpreterMacroAssembler::set_mdp_flag_at(Register mdp_in, + int flag_byte_constant) { + assert(ProfileInterpreter, "must be profiling interpreter"); + int flags_offset = in_bytes(DataLayout::flags_offset()); + // Set the flag + ldrb(rscratch1, Address(mdp_in, flags_offset)); + orr(rscratch1, rscratch1, flag_byte_constant); + strb(rscratch1, Address(mdp_in, flags_offset)); +} + + +void InterpreterMacroAssembler::test_mdp_data_at(Register mdp_in, + int offset, + Register value, + Register test_value_out, + Label& not_equal_continue) { + assert(ProfileInterpreter, "must be profiling interpreter"); + if (test_value_out == noreg) { + ldr(rscratch1, Address(mdp_in, offset)); + cmp(value, rscratch1); + } else { + // Put the test value into a register, so caller can use it: + ldr(test_value_out, Address(mdp_in, offset)); + cmp(value, test_value_out); + } + b(not_equal_continue, Assembler::NE); +} + + +void InterpreterMacroAssembler::update_mdp_by_offset(Register mdp_in, + int offset_of_disp) { + assert(ProfileInterpreter, "must be profiling interpreter"); + ldr(rscratch1, Address(mdp_in, offset_of_disp)); + add(mdp_in, mdp_in, rscratch1); + str(mdp_in, Address(rfp, frame::get_interpreter_frame_mdp_offset() * wordSize)); +} + + +void InterpreterMacroAssembler::update_mdp_by_offset(Register mdp_in, + Register reg, + int offset_of_disp) { + assert(ProfileInterpreter, "must be profiling interpreter"); + lea(rscratch1, Address(mdp_in, offset_of_disp)); + ldr(rscratch1, Address(rscratch1, reg, lsl())); + add(mdp_in, mdp_in, rscratch1); + str(mdp_in, Address(rfp, frame::get_interpreter_frame_mdp_offset() * wordSize)); +} + + +void InterpreterMacroAssembler::update_mdp_by_constant(Register mdp_in, + int constant) { + assert(ProfileInterpreter, "must be profiling interpreter"); + add(mdp_in, mdp_in, (unsigned) constant); + str(mdp_in, Address(rfp, frame::get_interpreter_frame_mdp_offset() * wordSize)); +} + + +void InterpreterMacroAssembler::update_mdp_for_ret(Register return_bci) { + assert(ProfileInterpreter, "must be profiling interpreter"); + // save/restore across call_VM + mov(rscratch1, 0); + strd(rscratch1, return_bci, Address(pre(sp, -2 * wordSize))); + call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::update_mdp_for_ret), + return_bci); + ldrd(rscratch1, return_bci, Address(post(sp, 2 * wordSize))); +} + + +void InterpreterMacroAssembler::profile_taken_branch(Register mdp, + Register bumped_count) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + // Otherwise, assign to mdp + test_method_data_pointer(mdp, profile_continue); + + // We are taking a branch. Increment the taken count. 
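increment_mdp_data_at() above, and profile_taken_branch() which inlines the same idiom just below, keep the 32-bit profile counters saturating rather than wrapping: the updated value is written back only when the add did not carry (or the subtract did not borrow). A minimal C++ equivalent:

#include <stdint.h>

static void saturating_increment(uint32_t* counter, uint32_t increment) {
  uint32_t updated = *counter + increment;
  if (updated >= increment) {   // no unsigned carry -- mirrors skipping the store on Assembler::CS
    *counter = updated;
  }
}

static void saturating_decrement(uint32_t* counter, uint32_t decrement) {
  if (*counter >= decrement) {  // no borrow -- mirrors skipping the store on Assembler::LO
    *counter = *counter - decrement;
  }
}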
+ // We inline increment_mdp_data_at to return bumped_count in a register + //increment_mdp_data_at(mdp, in_bytes(JumpData::taken_offset())); + Address data(mdp, in_bytes(JumpData::taken_offset())); + ldr(bumped_count, data); + assert(DataLayout::counter_increment == 1, + "flow-free idiom only works with 1"); + // Intel does this to catch overflow + // addptr(bumped_count, DataLayout::counter_increment); + // sbbptr(bumped_count, 0); + // so we do this + adds(bumped_count, bumped_count, DataLayout::counter_increment); + Label L; + b(L, Assembler::CS); // skip store if counter overflow + str(bumped_count, data); + bind(L); + // The method data pointer needs to be updated to reflect the new target. + update_mdp_by_offset(mdp, in_bytes(JumpData::displacement_offset())); + bind(profile_continue); + } +} + + +void InterpreterMacroAssembler::profile_not_taken_branch(Register mdp) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + // We are taking a branch. Increment the not taken count. + increment_mdp_data_at(mdp, in_bytes(BranchData::not_taken_offset())); + + // The method data pointer needs to be updated to correspond to + // the next bytecode + update_mdp_by_constant(mdp, in_bytes(BranchData::branch_data_size())); + bind(profile_continue); + } +} + + +void InterpreterMacroAssembler::profile_call(Register mdp) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + // We are making a call. Increment the count. + increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); + + // The method data pointer needs to be updated to reflect the new target. + update_mdp_by_constant(mdp, in_bytes(CounterData::counter_data_size())); + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_final_call(Register mdp) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + // We are making a call. Increment the count. + increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); + + // The method data pointer needs to be updated to reflect the new target. + update_mdp_by_constant(mdp, + in_bytes(VirtualCallData:: + virtual_call_data_size())); + bind(profile_continue); + } +} + + +void InterpreterMacroAssembler::profile_virtual_call(Register receiver, + Register mdp, + Register reg2, + bool receiver_can_be_null) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + Label skip_receiver_profile; + if (receiver_can_be_null) { + Label not_null; + // We are making a call. Increment the count for null receiver. + increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); + b(skip_receiver_profile); + bind(not_null); + } + + // Record the receiver type. + record_klass_in_profile(receiver, mdp, reg2, true); + bind(skip_receiver_profile); + + // The method data pointer needs to be updated to reflect the new target. 
+#if INCLUDE_JVMCI + if (MethodProfileWidth == 0) { + update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size())); + } +#else // INCLUDE_JVMCI + update_mdp_by_constant(mdp, + in_bytes(VirtualCallData:: + virtual_call_data_size())); +#endif // INCLUDE_JVMCI + bind(profile_continue); + } +} + +#if INCLUDE_JVMCI +void InterpreterMacroAssembler::profile_called_method(Register method, Register mdp, Register reg2) { + assert_different_registers(method, mdp, reg2); + if (ProfileInterpreter && MethodProfileWidth > 0) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + Label done; + record_item_in_profile_helper(method, mdp, reg2, 0, done, MethodProfileWidth, + &VirtualCallData::method_offset, &VirtualCallData::method_count_offset, in_bytes(VirtualCallData::nonprofiled_receiver_count_offset())); + bind(done); + + update_mdp_by_constant(mdp, in_bytes(VirtualCallData::virtual_call_data_size())); + bind(profile_continue); + } +} +#endif // INCLUDE_JVMCI + +// This routine creates a state machine for updating the multi-row +// type profile at a virtual call site (or other type-sensitive bytecode). +// The machine visits each row (of receiver/count) until the receiver type +// is found, or until it runs out of rows. At the same time, it remembers +// the location of the first empty row. (An empty row records null for its +// receiver, and can be allocated for a newly-observed receiver type.) +// Because there are two degrees of freedom in the state, a simple linear +// search will not work; it must be a decision tree. Hence this helper +// function is recursive, to generate the required tree structured code. +// It's the interpreter, so we are trading off code space for speed. +// See below for example code. +void InterpreterMacroAssembler::record_klass_in_profile_helper( + Register receiver, Register mdp, + Register reg2, int start_row, + Label& done, bool is_virtual_call) { + if (TypeProfileWidth == 0) { + if (is_virtual_call) { + increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); + } +#if INCLUDE_JVMCI + else if (EnableJVMCI) { + increment_mdp_data_at(mdp, in_bytes(ReceiverTypeData::nonprofiled_receiver_count_offset())); + } +#endif // INCLUDE_JVMCI + } else { + int non_profiled_offset = -1; + if (is_virtual_call) { + non_profiled_offset = in_bytes(CounterData::count_offset()); + } +#if INCLUDE_JVMCI + else if (EnableJVMCI) { + non_profiled_offset = in_bytes(ReceiverTypeData::nonprofiled_receiver_count_offset()); + } +#endif // INCLUDE_JVMCI + + record_item_in_profile_helper(receiver, mdp, reg2, 0, done, TypeProfileWidth, + &VirtualCallData::receiver_offset, &VirtualCallData::receiver_count_offset, non_profiled_offset); + } +} + +void InterpreterMacroAssembler::record_item_in_profile_helper(Register item, Register mdp, + Register reg2, int start_row, Label& done, int total_rows, + OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn, + int non_profiled_offset) { + int last_row = total_rows - 1; + assert(start_row <= last_row, "must be work left to do"); + // Test this row for both the item and for null. + // Take any of three different outcomes: + // 1. found item => increment count and goto done + // 2. found null => keep looking for case 1, maybe allocate this cell + // 3. found something else => keep looking for cases 1 and 2 + // Case 3 is handled by a recursive call. 
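record_item_in_profile_helper(), whose row loop follows this note, unrolls the search into a decision tree for speed; the underlying policy is easier to see as a flat loop. The struct and names below are illustrative, not the MDO cell layout:

#include <stddef.h>
#include <stdint.h>

struct ProfileRow {        // models one (receiver, count) pair in the profile
  void*    receiver;       // NULL while the row is unused
  uint32_t count;
};

// Find the matching row, else claim the first empty row, else fall back to the
// polymorphic counter -- the same three outcomes listed in the comment above.
static void record_receiver(ProfileRow* rows, int nrows, void* receiver,
                            uint32_t* polymorphic_count) {
  int first_empty = -1;
  for (int i = 0; i < nrows; i++) {
    if (rows[i].receiver == receiver) { rows[i].count++; return; }    // case 1
    if (rows[i].receiver == NULL && first_empty < 0) first_empty = i; // case 2
  }
  if (first_empty >= 0) {
    rows[first_empty].receiver = receiver;   // newly observed type claims the empty row
    rows[first_empty].count = 1;             // DataLayout::counter_increment
  } else if (polymorphic_count != NULL) {
    (*polymorphic_count)++;                  // ran out of rows: megamorphic case
  }
}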
+ for (int row = start_row; row <= last_row; row++) { + Label next_test; + bool test_for_null_also = (row == start_row); + + // See if the item is item[n]. + int item_offset = in_bytes(item_offset_fn(row)); + test_mdp_data_at(mdp, item_offset, item, + (test_for_null_also ? reg2 : noreg), + next_test); + // (Reg2 now contains the item from the CallData.) + + // The item is item[n]. Increment count[n]. + int count_offset = in_bytes(item_count_offset_fn(row)); + increment_mdp_data_at(mdp, count_offset); + b(done); + bind(next_test); + + if (test_for_null_also) { + Label found_null; + // Failed the equality check on item[n]... Test for null. + if (start_row == last_row) { + // The only thing left to do is handle the null case. + if (non_profiled_offset >= 0) { + cbz(reg2, found_null); + // Item did not match any saved item and there is no empty row for it. + // Increment total counter to indicate polymorphic case. + increment_mdp_data_at(mdp, non_profiled_offset); + b(done); + bind(found_null); + } else { + cbnz(reg2, done); + } + break; + } + // Since null is rare, make it be the branch-taken case. + cbz(reg2,found_null); + + // Put all the "Case 3" tests here. + record_item_in_profile_helper(item, mdp, reg2, start_row + 1, done, total_rows, + item_offset_fn, item_count_offset_fn, non_profiled_offset); + + // Found a null. Keep searching for a matching item, + // but remember that this is an empty (unused) slot. + bind(found_null); + } + } + + // In the fall-through case, we found no matching item, but we + // observed the item[start_row] is NULL. + + // Fill in the item field and increment the count. + int item_offset = in_bytes(item_offset_fn(start_row)); + set_mdp_data_at(mdp, item_offset, item); + int count_offset = in_bytes(item_count_offset_fn(start_row)); + mov(reg2, DataLayout::counter_increment); + set_mdp_data_at(mdp, count_offset, reg2); + if (start_row > 0) { + b(done); + } +} + +// Example state machine code for three profile rows: +// // main copy of decision tree, rooted at row[1] +// if (row[0].rec == rec) { row[0].incr(); goto done; } +// if (row[0].rec != NULL) { +// // inner copy of decision tree, rooted at row[1] +// if (row[1].rec == rec) { row[1].incr(); goto done; } +// if (row[1].rec != NULL) { +// // degenerate decision tree, rooted at row[2] +// if (row[2].rec == rec) { row[2].incr(); goto done; } +// if (row[2].rec != NULL) { count.incr(); goto done; } // overflow +// row[2].init(rec); goto done; +// } else { +// // remember row[1] is empty +// if (row[2].rec == rec) { row[2].incr(); goto done; } +// row[1].init(rec); goto done; +// } +// } else { +// // remember row[0] is empty +// if (row[1].rec == rec) { row[1].incr(); goto done; } +// if (row[2].rec == rec) { row[2].incr(); goto done; } +// row[0].init(rec); goto done; +// } +// done: + +void InterpreterMacroAssembler::record_klass_in_profile(Register receiver, + Register mdp, Register reg2, + bool is_virtual_call) { + assert(ProfileInterpreter, "must be profiling"); + Label done; + + record_klass_in_profile_helper(receiver, mdp, reg2, 0, done, is_virtual_call); + + bind (done); +} + +void InterpreterMacroAssembler::profile_ret(Register return_bci, + Register mdp) { + if (ProfileInterpreter) { + Label profile_continue; + uint row; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + // Update the total ret count. 
+ increment_mdp_data_at(mdp, in_bytes(CounterData::count_offset())); + + for (row = 0; row < RetData::row_limit(); row++) { + Label next_test; + + // See if return_bci is equal to bci[n]: + test_mdp_data_at(mdp, + in_bytes(RetData::bci_offset(row)), + return_bci, noreg, + next_test); + + // return_bci is equal to bci[n]. Increment the count. + increment_mdp_data_at(mdp, in_bytes(RetData::bci_count_offset(row))); + + // The method data pointer needs to be updated to reflect the new target. + update_mdp_by_offset(mdp, + in_bytes(RetData::bci_displacement_offset(row))); + b(profile_continue); + bind(next_test); + } + + update_mdp_for_ret(return_bci); + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_null_seen(Register mdp) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + set_mdp_flag_at(mdp, BitData::null_seen_byte_constant()); + + // The method data pointer needs to be updated. + int mdp_delta = in_bytes(BitData::bit_data_size()); + if (TypeProfileCasts) { + mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size()); + } + update_mdp_by_constant(mdp, mdp_delta); + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_typecheck_failed(Register mdp) { + if (ProfileInterpreter && TypeProfileCasts) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + int count_offset = in_bytes(CounterData::count_offset()); + // Back up the address, since we have already bumped the mdp. + count_offset -= in_bytes(VirtualCallData::virtual_call_data_size()); + + // *Decrement* the counter. We expect to see zero or small negatives. + increment_mdp_data_at(mdp, count_offset, true); + + bind (profile_continue); + } +} + +void InterpreterMacroAssembler::profile_typecheck(Register mdp, Register klass, Register reg2) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + // The method data pointer needs to be updated. + int mdp_delta = in_bytes(BitData::bit_data_size()); + if (TypeProfileCasts) { + mdp_delta = in_bytes(VirtualCallData::virtual_call_data_size()); + + // Record the object type. + record_klass_in_profile(klass, mdp, reg2, false); + } + update_mdp_by_constant(mdp, mdp_delta); + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_switch_default(Register mdp) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. + test_method_data_pointer(mdp, profile_continue); + + // Update the default case count + increment_mdp_data_at(mdp, + in_bytes(MultiBranchData::default_count_offset())); + + // The method data pointer needs to be updated. + update_mdp_by_offset(mdp, + in_bytes(MultiBranchData:: + default_displacement_offset())); + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_switch_case(Register index, + Register mdp, + Register reg2) { + if (ProfileInterpreter) { + Label profile_continue; + + // If no method data exists, go to profile_continue. 
+ test_method_data_pointer(mdp, profile_continue); + + // Build the base (index * per_case_size_in_bytes()) + + // case_array_offset_in_bytes() + mov(reg2, in_bytes(MultiBranchData::per_case_size())); + mov(rscratch1, in_bytes(MultiBranchData::case_array_offset())); + Assembler::mla(index, index, reg2, rscratch1); + + // Update the case count + increment_mdp_data_at(mdp, + index, + in_bytes(MultiBranchData::relative_count_offset())); + + // The method data pointer needs to be updated. + update_mdp_by_offset(mdp, + index, + in_bytes(MultiBranchData:: + relative_displacement_offset())); + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::verify_oop(Register reg, TosState state) { + if (state == atos) { + MacroAssembler::verify_oop(reg); + } +} + +void InterpreterMacroAssembler::verify_FPU(int stack_depth, TosState state) { ; } + + +void InterpreterMacroAssembler::notify_method_entry() { + // Whenever JVMTI is interp_only_mode, method entry/exit events are sent to + // track stack depth. If it is possible to enter interp_only_mode we add + // the code to check if the event should be sent. + if (JvmtiExport::can_post_interpreter_events()) { + Label L; + ldr(r3, Address(rthread, JavaThread::interp_only_mode_offset())); + cbz(r3, L); + call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::post_method_entry)); + bind(L); + } + +#ifdef DTRACE_ENABLED + { + SkipIfEqual skip(this, &DTraceMethodProbes, false); + get_method(c_rarg1); + call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), + rthread, c_rarg1); + } +#endif + + // RedefineClasses() tracing support for obsolete method entry + if (log_is_enabled(Trace, redefine, class, obsolete)) { + get_method(c_rarg1); + call_VM_leaf( + CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), + rthread, c_rarg1); + } + +} + + +void InterpreterMacroAssembler::notify_method_exit( + TosState state, NotifyMethodExitMode mode) { + // Whenever JVMTI is interp_only_mode, method entry/exit events are sent to + // track stack depth. If it is possible to enter interp_only_mode we add + // the code to check if the event should be sent. + if (mode == NotifyJVMTI && JvmtiExport::can_post_interpreter_events()) { + Label L; + // Note: frame::interpreter_frame_result has a dependency on how the + // method result is saved across the call to post_method_exit. If this + // is changed then the interpreter_frame_result implementation will + // need to be updated too. + + push(state); + ldr(r3, Address(rthread, JavaThread::interp_only_mode_offset())); + cbz(r3, L); + call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::post_method_exit)); + bind(L); + pop(state); + } + +#ifdef DTRACE_ENABLED + { + SkipIfEqual skip(this, &DTraceMethodProbes, false); + push(state); + get_method(c_rarg1); + call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), + rthread, c_rarg1); + pop(state); + } +#endif +} + + +// Jump if ((*counter_addr += increment) & mask) satisfies the condition. 
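increment_mask_and_jump(), defined immediately below, implements the periodic check described by the comment above; as a plain function the behaviour it encodes is roughly the following sketch:

#include <stdint.h>

// The counter is bumped by 'increment'; the caller's slow path (passed via
// 'where'/'cond', typically an overflow handler tested with EQ) is taken when
// the masked value reaches zero, so it fires only periodically.
static bool increment_and_check(uint32_t* counter, uint32_t increment, uint32_t mask) {
  *counter += increment;
  return (*counter & mask) == 0;   // ands(scratch, scratch, scratch2); b(*where, cond)
}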
+void InterpreterMacroAssembler::increment_mask_and_jump(Address counter_addr, + int increment, Address mask, + Register scratch, Register scratch2, + bool preloaded, Condition cond, + Label* where) { + if (!preloaded) { + ldr(scratch, counter_addr); + } + add(scratch, scratch, increment); + str(scratch, counter_addr); + ldr(scratch2, mask); + ands(scratch, scratch, scratch2); + if (where) + b(*where, cond); +} + +void InterpreterMacroAssembler::call_VM_leaf_base(address entry_point, + int number_of_arguments, + Label *retaddr) { + // interpreter specific + // + // Note: No need to save/restore rbcp & rlocals pointer since these + // are callee saved registers and no blocking/ GC can happen + // in leaf calls. +#ifdef ASSERT + { + Label L; + ldr(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + cbz(rscratch1, L); + stop("InterpreterMacroAssembler::call_VM_leaf_base:" + " last_sp != NULL"); + bind(L); + } +#endif /* ASSERT */ + // super call + MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, retaddr); +} + +void InterpreterMacroAssembler::call_VM_base(Register oop_result, + Register java_thread, + Register last_java_sp, + address entry_point, + int number_of_arguments, + bool check_exceptions) { + // interpreter specific + // + // Note: Could avoid restoring locals ptr (callee saved) - however doesn't + // really make a difference for these runtime calls, since they are + // slow anyway. Btw., bcp must be saved/restored since it may change + // due to GC. + // assert(java_thread == noreg , "not expecting a precomputed java thread"); + save_bcp(); +#ifdef ASSERT + { + Label L; + ldr(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + cbz(rscratch1, L); + stop("InterpreterMacroAssembler::call_VM_base:" + " last_sp != NULL"); + bind(L); + } +#endif /* ASSERT */ + // super call + MacroAssembler::call_VM_base(oop_result, noreg, last_java_sp, + entry_point, number_of_arguments, + check_exceptions); +// interpreter specific + restore_bcp(); + //restore_locals(); +} + +void InterpreterMacroAssembler::profile_obj_type(Register obj, const Address& mdo_addr) { + assert_different_registers(obj, rscratch1); + Label update, next, none; + + verify_oop(obj); + + cbnz(obj, update); + orptr(mdo_addr, TypeEntries::null_seen); + b(next); + + bind(update); + load_klass(obj, obj); + + ldr(rscratch1, mdo_addr); + eor(obj, obj, rscratch1); + bics(rscratch1, obj, ~TypeEntries::type_klass_mask); + b(next, Assembler::EQ); // klass seen before, nothing to + // do. The unknown bit may have been + // set already but no need to check. + + tst(obj, TypeEntries::type_unknown); + b(next, Assembler::NE); // already unknown. Nothing to do anymore. + + ldr(rscratch1, mdo_addr); + cbz(rscratch1, none); + cmp(rscratch1, TypeEntries::null_seen); + b(none, Assembler::EQ); + // There is a chance that the checks above (re-reading profiling + // data from memory) fail if another thread has just set the + // profiling to this obj's klass + ldr(rscratch1, mdo_addr); + eor(obj, obj, rscratch1); + bics(rscratch1, obj, ~TypeEntries::type_klass_mask); + b(next, Assembler::EQ); + + // different than before. Cannot keep accurate profile. + orptr(mdo_addr, TypeEntries::type_unknown); + b(next); + + bind(none); + // first time here. Set profile type. 
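profile_obj_type() above (its final store is the first statement after this note) maintains a single profiling cell that packs a klass pointer with two low status bits. Assuming the usual TypeEntries-style encoding (bit 0 = a null was seen, bit 1 = conflicting types were seen), the update policy it implements is:

#include <stdint.h>
#include <stddef.h>

static const uintptr_t kNullSeen    = 1;                  // assumed TypeEntries::null_seen
static const uintptr_t kTypeUnknown = 2;                  // assumed TypeEntries::type_unknown
static const uintptr_t kStatusBits  = kNullSeen | kTypeUnknown;

static void update_type_cell(uintptr_t* cell, const void* klass /* NULL for a null receiver */) {
  if (klass == NULL) {
    *cell |= kNullSeen;                                    // orptr(mdo_addr, TypeEntries::null_seen)
    return;
  }
  uintptr_t current = *cell;
  if (((current ^ (uintptr_t)klass) & ~kStatusBits) == 0)  // eor + bics with the klass mask
    return;                                                // same klass as before, nothing to do
  if (current & kTypeUnknown)
    return;                                                // already marked polymorphic
  if (current == 0 || current == kNullSeen) {
    *cell = (uintptr_t)klass;                              // first time here: record the type
  } else {
    *cell |= kTypeUnknown;                                 // different klass: give up on precision
  }
}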
+ str(obj, mdo_addr); + + bind(next); +} + +void InterpreterMacroAssembler::profile_arguments_type(Register mdp, Register callee, Register tmp, bool is_virtual) { + if (!ProfileInterpreter) { + return; + } + + if (MethodData::profile_arguments() || MethodData::profile_return()) { + Label profile_continue; + + test_method_data_pointer(mdp, profile_continue); + + int off_to_start = is_virtual ? in_bytes(VirtualCallData::virtual_call_data_size()) : in_bytes(CounterData::counter_data_size()); + + ldrb(rscratch1, Address(mdp, in_bytes(DataLayout::tag_offset()) - off_to_start)); + cmp(rscratch1, is_virtual ? DataLayout::virtual_call_type_data_tag : DataLayout::call_type_data_tag); + b(profile_continue, Assembler::NE); + + if (MethodData::profile_arguments()) { + Label done; + int off_to_args = in_bytes(TypeEntriesAtCall::args_data_offset()); + add(mdp, mdp, off_to_args); + + for (int i = 0; i < TypeProfileArgsLimit; i++) { + if (i > 0 || MethodData::profile_return()) { + // If return value type is profiled we may have no argument to profile + ldr(tmp, Address(mdp, in_bytes(TypeEntriesAtCall::cell_count_offset())-off_to_args)); + sub(tmp, tmp, i*TypeStackSlotEntries::per_arg_count()); + cmp(tmp, TypeStackSlotEntries::per_arg_count()); + b(done, Assembler::LT); + } + ldr(tmp, Address(callee, Method::const_offset())); + load_unsigned_short(tmp, Address(tmp, ConstMethod::size_of_parameters_offset())); + // stack offset o (zero based) from the start of the argument + // list, for n arguments translates into offset n - o - 1 from + // the end of the argument list + ldr(rscratch1, Address(mdp, in_bytes(TypeEntriesAtCall::stack_slot_offset(i))-off_to_args)); + sub(tmp, tmp, rscratch1); + sub(tmp, tmp, 1); + Address arg_addr = argument_address(tmp); + ldr(tmp, arg_addr); + + Address mdo_arg_addr(mdp, in_bytes(TypeEntriesAtCall::argument_type_offset(i))-off_to_args); + profile_obj_type(tmp, mdo_arg_addr); + + int to_add = in_bytes(TypeStackSlotEntries::per_arg_size()); + add(mdp, mdp, to_add); + off_to_args += to_add; + } + + if (MethodData::profile_return()) { + ldr(tmp, Address(mdp, in_bytes(TypeEntriesAtCall::cell_count_offset())-off_to_args)); + sub(tmp, tmp, TypeProfileArgsLimit*TypeStackSlotEntries::per_arg_count()); + } + + bind(done); + + if (MethodData::profile_return()) { + // We're right after the type profile for the last + // argument. tmp is the number of cells left in the + // CallTypeData/VirtualCallTypeData to reach its end. Non null + // if there's a return to profile. 
+ assert(ReturnTypeEntry::static_cell_count() < TypeStackSlotEntries::per_arg_count(), "can't move past ret type"); + add(mdp, mdp, tmp, lsl(exact_log2(DataLayout::cell_size))); + } + str(mdp, Address(rfp, frame::get_interpreter_frame_mdp_offset() * wordSize)); + } else { + assert(MethodData::profile_return(), "either profile call args or call ret"); + update_mdp_by_constant(mdp, in_bytes(TypeEntriesAtCall::return_only_size())); + } + + // mdp points right after the end of the + // CallTypeData/VirtualCallTypeData, right after the cells for the + // return value type if there's one + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_return_type(Register mdp, Register ret, Register tmp) { + assert_different_registers(mdp, ret, tmp, rbcp); + if (ProfileInterpreter && MethodData::profile_return()) { + Label profile_continue, done; + + test_method_data_pointer(mdp, profile_continue); + + if (MethodData::profile_return_jsr292_only()) { + assert(Method::intrinsic_id_size_in_bytes() == 2, "assuming Method::_intrinsic_id is u2"); + + // If we don't profile all invoke bytecodes we must make sure + // it's a bytecode we indeed profile. We can't go back to the + // begining of the ProfileData we intend to update to check its + // type because we're right after it and we don't known its + // length + Label do_profile; + ldrb(rscratch1, Address(rbcp, 0)); + cmp(rscratch1, Bytecodes::_invokedynamic); + b(do_profile, Assembler::EQ); + cmp(rscratch1, Bytecodes::_invokehandle); + b(do_profile, Assembler::EQ); + get_method(tmp); + ldrh(rscratch1, Address(tmp, Method::intrinsic_id_offset_in_bytes())); + mov(tmp, vmIntrinsics::_compiledLambdaForm); + cmp(rscratch1, tmp); + b(profile_continue, Assembler::NE); + + bind(do_profile); + } + + Address mdo_ret_addr(mdp, -in_bytes(ReturnTypeEntry::size())); + mov(tmp, ret); + profile_obj_type(tmp, mdo_ret_addr); + + bind(profile_continue); + } +} + +void InterpreterMacroAssembler::profile_parameters_type(Register mdp, Register tmp1, Register tmp2) { + assert_different_registers(rscratch1, rscratch2, mdp, tmp1, tmp2); + if (ProfileInterpreter && MethodData::profile_parameters()) { + Label profile_continue, done; + + test_method_data_pointer(mdp, profile_continue); + + // Load the offset of the area within the MDO used for + // parameters. If it's negative we're not profiling any parameters + ldr(tmp1, Address(mdp, in_bytes(MethodData::parameters_type_data_di_offset()) - in_bytes(MethodData::data_offset()))); + cmp(tmp1, 0u); + b(profile_continue, Assembler::LT); + + // Compute a pointer to the area for parameters from the offset + // and move the pointer to the slot for the last + // parameters. Collect profiling from last parameter down. 
+ // mdo start + parameters offset + array length - 1 + add(mdp, mdp, tmp1); + ldr(tmp1, Address(mdp, ArrayData::array_len_offset())); + sub(tmp1, tmp1, TypeStackSlotEntries::per_arg_count()); + + Label loop; + bind(loop); + + int off_base = in_bytes(ParametersTypeData::stack_slot_offset(0)); + int type_base = in_bytes(ParametersTypeData::type_offset(0)); + int per_arg_scale = exact_log2(DataLayout::cell_size); + add(rscratch1, mdp, off_base); + add(rscratch2, mdp, type_base); + + Address arg_off(rscratch1, tmp1, lsl(per_arg_scale)); + Address arg_type(rscratch2, tmp1, lsl(per_arg_scale)); + + // load offset on the stack from the slot for this parameter + ldr(tmp2, arg_off); + neg(tmp2, tmp2); + // read the parameter from the local area + ldr(tmp2, Address(rlocals, tmp2, lsl(Interpreter::logStackElementSize))); + + // profile the parameter + profile_obj_type(tmp2, arg_type); + + // go to next parameter + subs(tmp1, tmp1, TypeStackSlotEntries::per_arg_count()); + b(loop, Assembler::GE); + + bind(profile_continue); + } +} --- /dev/null 2018-09-25 19:25:03.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/interp_masm_aarch32.hpp 2018-09-25 19:25:03.000000000 +0300 @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_INTERP_MASM_AARCH32_64_HPP +#define CPU_AARCH32_VM_INTERP_MASM_AARCH32_64_HPP + +#include "asm/macroAssembler.hpp" +#include "interpreter/invocationCounter.hpp" +#include "runtime/frame.hpp" + +// This file specializes the assember with interpreter-specific macros + +typedef ByteSize (*OffsetFunction)(uint); + +class InterpreterMacroAssembler: public MacroAssembler { + protected: + // Interpreter specific version of call_VM_base + using MacroAssembler::call_VM_leaf_base; + + public: + virtual void call_VM_leaf_base(address entry_point, + int number_of_arguments, + Label *retaddr = NULL); + protected: + virtual void call_VM_base(Register oop_result, + Register java_thread, + Register last_java_sp, + address entry_point, + int number_of_arguments, + bool check_exceptions); + + // base routine for all dispatches + void dispatch_base(TosState state, address* table, + bool verifyoop = true, bool generate_poll = false); + + public: + InterpreterMacroAssembler(CodeBuffer* code) : MacroAssembler(code) {} + + void load_earlyret_value(TosState state); + + void jump_to_entry(address entry); + + virtual void check_and_handle_popframe(Register java_thread); + virtual void check_and_handle_earlyret(Register java_thread); + + // Interpreter-specific registers + void save_bcp() { + str(rbcp, Address(rfp, frame::get_interpreter_frame_bcp_offset() * wordSize)); + } + + void restore_bcp() { + ldr(rbcp, Address(rfp, frame::get_interpreter_frame_bcp_offset() * wordSize)); + } + + void restore_locals() { + ldr(rlocals, Address(rfp, frame::get_interpreter_frame_locals_offset() * wordSize)); + } + + void restore_constant_pool_cache() { + ldr(rcpool, Address(rfp, frame::get_interpreter_frame_cache_offset() * wordSize)); + } + + void get_dispatch(); + + // Helpers for runtime call arguments/results + + void get_method(Register reg) { + ldr(reg, Address(rfp, frame::get_interpreter_frame_method_offset() * wordSize)); + } + + void get_const(Register reg) { + get_method(reg); + ldr(reg, Address(reg, in_bytes(Method::const_offset()))); + } + + void get_constant_pool(Register reg) { + get_const(reg); + ldr(reg, Address(reg, in_bytes(ConstMethod::constants_offset()))); + } + + void get_constant_pool_cache(Register reg) { + get_constant_pool(reg); + ldr(reg, Address(reg, ConstantPool::cache_offset_in_bytes())); + } + + void get_cpool_and_tags(Register cpool, Register tags) { + get_constant_pool(cpool); + ldr(tags, Address(cpool, ConstantPool::tags_offset_in_bytes())); + } + + void get_unsigned_2_byte_index_at_bcp(Register reg, int bcp_offset); + void get_cache_and_index_at_bcp(Register cache, Register index, int bcp_offset, size_t index_size = sizeof(u2)); + void get_cache_and_index_and_bytecode_at_bcp(Register cache, Register index, Register bytecode, int byte_no, int bcp_offset, size_t index_size = sizeof(u2)); + void get_cache_entry_pointer_at_bcp(Register cache, Register tmp, int bcp_offset, size_t index_size = sizeof(u2)); + void get_cache_index_at_bcp(Register index, int bcp_offset, size_t index_size = sizeof(u2)); + void get_method_counters(Register method, Register mcs, Label& skip); + + // load cpool->resolved_references(index); + void load_resolved_reference_at_index(Register result, Register index, Register tmp = r5); + + // load cpool->resolved_klass_at(index); + void load_resolved_klass_at_offset(Register cpool, Register index, Register klass, Register temp); + + void pop_ptr(Register r = r0); + void pop_i(Register r = r0); + void pop_l(Register rLo = r0, Register rHi = r1); 
+ void push_ptr(Register r = r0); + void push_i(Register r = r0); + void push_l(Register rLo = r0, Register rHi = r1); + + void push_f(FloatRegister r = d0); + void push_d(FloatRegister r = d0); + void pop_f(FloatRegister r = d0); + void pop_d(FloatRegister r = d0); + + + void pop(Register r ) { ((MacroAssembler*)this)->pop(r); } + + void push(Register r ) { ((MacroAssembler*)this)->push(r); } + + void pop(TosState state); // transition vtos -> state + void push(TosState state); // transition state -> vtos + + void pop(RegSet regs, Register stack) { ((MacroAssembler*)this)->pop(regs, stack); } + void push(RegSet regs, Register stack) { ((MacroAssembler*)this)->push(regs, stack); } + + void empty_expression_stack() { + ldr(sp, Address(rfp, frame::get_interpreter_frame_monitor_block_top_offset() * wordSize)); + // NULL last_sp until next java call + mov(rscratch1, 0); + str(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + } + + // Helpers for swap and dup + void load_ptr(int n, Register val); + void store_ptr(int n, Register val); + + // Load ftos/dtos from given address + void load_float(Address src); + void load_double(Address src); + + // Generate a subtype check: branch to ok_is_subtype if sub_klass is + // a subtype of super_klass. + void gen_subtype_check( Register sub_klass, Label &ok_is_subtype ); + + // Dispatching + void dispatch_prolog(TosState state, int step = 0); + void dispatch_epilog(TosState state, int step = 0); + // dispatch via rscratch1 + void dispatch_only(TosState state, bool generate_poll = false); + // dispatch normal table via rscratch1 (assume rscratch1 is loaded already) + void dispatch_only_normal(TosState state); + void dispatch_only_noverify(TosState state); + // load rscratch1 from [rbcp + step] and dispatch via rscratch1 + void dispatch_next(TosState state, int step = 0, bool generate_poll = false); + // load rscratch1 from [esi] and dispatch via rscratch1 and table + void dispatch_via (TosState state, address* table); + + // jump to an invoked target + void prepare_to_jump_from_interpreted(); + void jump_from_interpreted(Register method, Register temp); + + + // Returning from interpreted functions + // + // Removes the current activation (incl. unlocking of monitors) + // and sets up the return address. This code is also used for + // exception unwindwing. In that case, we do not want to throw + // IllegalMonitorStateExceptions, since that might get us into an + // infinite rethrow exception loop. + // Additionally this code is used for popFrame and earlyReturn. + // In popFrame case we want to skip throwing an exception, + // installing an exception, and notifying jvmdi. + // In earlyReturn case we only want to skip throwing an exception + // and installing an exception. + void remove_activation(TosState state, + bool throw_monitor_exception = true, + bool install_monitor_exception = true, + bool notify_jvmdi = true); + + // FIXME: Give us a valid frame at a null check. 
+ virtual void null_check(Register reg, int offset = -1) { +// #ifdef ASSERT +// save_bcp(); +// set_last_Java_frame(sp, rfp, (address) pc()); +// #endif + MacroAssembler::null_check(reg, offset); +// #ifdef ASSERT +// reset_last_Java_frame(true); +// #endif + } + + // Object locking + void lock_object (Register lock_reg); + void unlock_object(Register lock_reg); + + // Interpreter profiling operations + void set_method_data_pointer_for_bcp(); + void test_method_data_pointer(Register mdp, Label& zero_continue); + void verify_method_data_pointer(); + + void set_mdp_data_at(Register mdp_in, int constant, Register value); + void increment_mdp_data_at(Address data, bool decrement = false); + void increment_mdp_data_at(Register mdp_in, int constant, + bool decrement = false); + void increment_mdp_data_at(Register mdp_in, Register reg, int constant, + bool decrement = false); + void increment_mask_and_jump(Address counter_addr, + int increment, Address mask, + Register scratch, Register scratch2, + bool preloaded, Condition cond, + Label* where); + void set_mdp_flag_at(Register mdp_in, int flag_constant); + void test_mdp_data_at(Register mdp_in, int offset, Register value, + Register test_value_out, + Label& not_equal_continue); + + void record_klass_in_profile(Register receiver, Register mdp, + Register reg2, bool is_virtual_call); + void record_klass_in_profile_helper(Register receiver, Register mdp, + Register reg2, int start_row, + Label& done, bool is_virtual_call); + void record_item_in_profile_helper(Register item, Register mdp, + Register reg2, int start_row, Label& done, int total_rows, + OffsetFunction item_offset_fn, OffsetFunction item_count_offset_fn, + int non_profiled_offset); + + void update_mdp_by_offset(Register mdp_in, int offset_of_offset); + void update_mdp_by_offset(Register mdp_in, Register reg, int offset_of_disp); + void update_mdp_by_constant(Register mdp_in, int constant); + void update_mdp_for_ret(Register return_bci); + + // narrow int return value + void narrow(Register result); + + void profile_taken_branch(Register mdp, Register bumped_count); + void profile_not_taken_branch(Register mdp); + void profile_call(Register mdp); + void profile_final_call(Register mdp); + void profile_virtual_call(Register receiver, Register mdp, + Register scratch2, + bool receiver_can_be_null = false); + void profile_called_method(Register method, Register mdp, Register reg2) NOT_JVMCI_RETURN; + void profile_ret(Register return_bci, Register mdp); + void profile_null_seen(Register mdp); + void profile_typecheck(Register mdp, Register klass, Register scratch); + void profile_typecheck_failed(Register mdp); + void profile_switch_default(Register mdp); + void profile_switch_case(Register index_in_scratch, Register mdp, + Register scratch2); + + void profile_obj_type(Register obj, const Address& mdo_addr); + void profile_arguments_type(Register mdp, Register callee, Register tmp, bool is_virtual); + void profile_return_type(Register mdp, Register ret, Register tmp); + void profile_parameters_type(Register mdp, Register tmp1, Register tmp2); + + // Debugging + // only if +VerifyOops && state == atos + void verify_oop(Register reg, TosState state = atos); + // only if +VerifyFPU && (state == ftos || state == dtos) + void verify_FPU(int stack_depth, TosState state = ftos); + + typedef enum { NotifyJVMTI, SkipNotifyJVMTI } NotifyMethodExitMode; + + // support for jvmti/dtrace + void notify_method_entry(); + void notify_method_exit(TosState state, NotifyMethodExitMode mode); + + virtual void 
_call_Unimplemented(address call_site) { + save_bcp(); + set_last_Java_frame(sp, rfp, (address) pc(), rscratch1); + MacroAssembler::_call_Unimplemented(call_site); + } +}; + +#endif // CPU_AARCH32_VM_INTERP_MASM_AARCH32_64_HPP --- /dev/null 2018-09-25 19:25:04.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/interpreterRT_aarch32.cpp 2018-09-25 19:25:04.000000000 +0300 @@ -0,0 +1,355 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "interpreter/interp_masm.hpp" +#include "interpreter/interpreter.hpp" +#include "interpreter/interpreterRuntime.hpp" +#include "memory/allocation.inline.hpp" +#include "memory/universe.hpp" +#include "oops/method.hpp" +#include "oops/oop.inline.hpp" +#include "runtime/handles.inline.hpp" +#include "runtime/icache.hpp" +#include "runtime/interfaceSupport.inline.hpp" +#include "runtime/signature.hpp" + +#define __ _masm-> + +/*#define print_copy(name, off) \ + __ mov(rscratch1, (address)name);\ + __ mov(rscratch2, off);\ + __ reg_printf("%s copied from offset %p + %d\n", rscratch1, from(), rscratch2);*/ + +#define print_copy(name, off) + +// Implementation of SignatureHandlerGenerator +Register InterpreterRuntime::SignatureHandlerGenerator::from() { return rlocals; } +Register InterpreterRuntime::SignatureHandlerGenerator::to() { return r4; } +Register InterpreterRuntime::SignatureHandlerGenerator::temp() { return rscratch1; } + +InterpreterRuntime::SignatureHandlerGenerator::SignatureHandlerGenerator( + const methodHandle &method, CodeBuffer* buffer): + NativeSignatureIterator(method), + _next_double_dex(0), + _stack_offset(0) +{ + _masm = new MacroAssembler(buffer); + _num_int_args = (method->is_static() ? 
1 : 0); + // See layout in interpreter_aarch32.cpp + _fp_arg_mask = (1 <<(Argument::n_float_register_parameters_c * 3)) - 1; +} + +void InterpreterRuntime::SignatureHandlerGenerator::pass_int() { + print_copy(__FUNCTION__, Interpreter::local_offset_in_bytes(offset())); + const Address src(from(), Interpreter::local_offset_in_bytes(offset())); + + switch (_num_int_args) { + case 0: + __ ldr(c_rarg1, src); + _num_int_args++; + break; + case 1: + __ ldr(c_rarg2, src); + _num_int_args++; + break; + case 2: + __ ldr(c_rarg3, src); + _num_int_args++; + break; + default: + __ ldr(r0, src); + __ str(r0, Address(to(), _stack_offset)); + _stack_offset += wordSize; + _num_int_args++; + break; + } +} + +void InterpreterRuntime::SignatureHandlerGenerator::pass_long() { + print_copy(__FUNCTION__, Interpreter::local_offset_in_bytes(offset() + 1)); + const Address src(from(), Interpreter::local_offset_in_bytes(offset() + 1)); + // Needs to be aligned to even registers. Means also won't be split across + // registers and stack. + + switch (_num_int_args) { + case 0: + case 1: + __ ldrd(c_rarg2, c_rarg3, src); + _num_int_args = 3; // force next args onto stack + break; + default: + __ ldrd(r0, temp(), src); + _stack_offset = (_stack_offset + 7) & ~7; // Align on 8-byte boundary + __ strd(r0, temp(), Address(to(), _stack_offset)); + _stack_offset += 2 * wordSize; + _num_int_args += 2; + break; + } +} + +#ifdef HARD_FLOAT_CC +void InterpreterRuntime::SignatureHandlerGenerator::pass_float() { + print_copy(__FUNCTION__, Interpreter::local_offset_in_bytes(offset())); + const Address src(from(), Interpreter::local_offset_in_bytes(offset())); + + if (_fp_arg_mask & ((1 << Argument::n_float_register_parameters_c*2)-1)) { + unsigned index = __builtin_ctz(_fp_arg_mask); + __ vldr_f32(as_FloatRegister(index), src); + _fp_arg_mask &= ~(1 << index); + _next_double_dex += (~index) & 1; + } else { + __ ldr(r0, src); + __ str(r0, Address(to(), _stack_offset)); + _stack_offset += wordSize; + } +} + +void InterpreterRuntime::SignatureHandlerGenerator::pass_double() { + print_copy(__FUNCTION__, Interpreter::local_offset_in_bytes(offset() + 1)); + const Address src(from(), Interpreter::local_offset_in_bytes(offset() + 1)); + + if (_next_double_dex < Argument::n_float_register_parameters_c) { + _fp_arg_mask &= ~((3 << _next_double_dex*2) | ((1 << _next_double_dex+16))); + __ vldr_f64(as_DoubleFloatRegister(_next_double_dex++), src); + } else { + __ ldrd(r0, temp(), src); + _stack_offset = (_stack_offset + 7) & ~7; + __ strd(r0, temp(), Address(to(), _stack_offset)); + _stack_offset += 2 * wordSize; + } +} +#else +// Just pass them in integer registers and on the stack as we would +// any other argument +void InterpreterRuntime::SignatureHandlerGenerator::pass_float() { + pass_int(); +} + +void InterpreterRuntime::SignatureHandlerGenerator::pass_double() { + pass_long(); +} +#endif //HARD_FLOAT_CC + +void InterpreterRuntime::SignatureHandlerGenerator::pass_object() { + print_copy(__FUNCTION__, Interpreter::local_offset_in_bytes(offset())); + + switch (_num_int_args) { + case 0: + assert(offset() == 0, "argument register 1 can only be (non-null) receiver"); + __ add(c_rarg1, from(), Interpreter::local_offset_in_bytes(offset())); + _num_int_args++; + break; + case 1: + { + __ add(r0, from(), Interpreter::local_offset_in_bytes(offset())); + __ mov(c_rarg2, 0); + __ ldr(temp(), r0); + Label L; + __ cbz(temp(), L); + __ mov(c_rarg2, r0); + __ bind(L); + _num_int_args++; + break; + } + case 2: + { + __ add(r0, from(), 
Interpreter::local_offset_in_bytes(offset())); + __ mov(c_rarg3, 0); + __ ldr(temp(), r0); + Label L; + __ cbz(temp(), L); + __ mov(c_rarg3, r0); + __ bind(L); + _num_int_args++; + break; + } + default: + { + __ add(r0, from(), Interpreter::local_offset_in_bytes(offset())); + __ ldr(temp(), r0); + Label L; + __ cbnz(temp(), L); + __ mov(r0, 0); + __ bind(L); + __ str(r0, Address(to(), _stack_offset)); + _stack_offset += wordSize; + _num_int_args++; + break; + } + } +} + +void InterpreterRuntime::SignatureHandlerGenerator::generate(uint64_t fingerprint) { + // generate code to handle arguments + iterate(fingerprint); + + // return result handler + __ lea(r0, ExternalAddress(Interpreter::result_handler(method()->result_type()))); + __ b(lr); + + __ flush(); +} + + +// Implementation of SignatureHandlerLibrary + +void SignatureHandlerLibrary::pd_set_handler(address handler) {} + + +class SlowSignatureHandler : public NativeSignatureIterator { + private: + address _from; + intptr_t* _to; + intptr_t* _int_args; + intptr_t* _fp_args; + intptr_t* _fp_identifiers; + + int _num_int_reg_args; + int _next_double_dex; + + virtual void pass_int() + { + jint from_obj = *(jint *)(_from+Interpreter::local_offset_in_bytes(0)); + _from -= Interpreter::stackElementSize; + + if (_num_int_reg_args < Argument::n_int_register_parameters_c-1) { + *_int_args++ = from_obj; + _num_int_reg_args++; + } else { + *_to++ = from_obj; + } + } + + virtual void pass_long() + { + intptr_t high_obj = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(0)); + intptr_t low_obj = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1)); + _from -= 2*Interpreter::stackElementSize; + + if (_num_int_reg_args < Argument::n_int_register_parameters_c-2) { + // Passing longs. As c_rarg0 is always reserved for jni_env we could only + // possibly stash a long in r3:r2 due to alignment so we can only enter here + // with either zero or one parameters. + // Align to two + _int_args += 1 - _num_int_reg_args; // 0 or 1 + *_int_args++ = low_obj; + *_int_args++ = high_obj; + _num_int_reg_args = 3; + } else { + _to = (intptr_t*)(((intptr_t)_to + 7) & ~7); // Align to eight bytes + *_to++ = low_obj; + *_to++ = high_obj; + _num_int_reg_args = 3; + } + } + + virtual void pass_object() + { + intptr_t *from_addr = (intptr_t*)(_from + Interpreter::local_offset_in_bytes(0)); + _from -= Interpreter::stackElementSize; + + if (_num_int_reg_args < Argument::n_int_register_parameters_c-1) { + *_int_args++ = (*from_addr == 0) ? NULL : (intptr_t)from_addr; + _num_int_reg_args++; + } else { + *_to++ = (*from_addr == 0) ? NULL : (intptr_t) from_addr; + } + } +#ifdef HARD_FLOAT_CC + virtual void pass_float() + { + jint from_obj = *(jint*)(_from+Interpreter::local_offset_in_bytes(0)); + _from -= Interpreter::stackElementSize; + + if ((*_fp_identifiers) & 0xffff) { + unsigned index = __builtin_ctz(*_fp_identifiers); + _fp_args[index] = from_obj; + *_fp_identifiers ^= 1 << index; + _next_double_dex += (~index) & 1; + } else { + *_to++ = from_obj; + } + } + + virtual void pass_double() + { + intptr_t high_obj = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(0)); + intptr_t low_obj = *(intptr_t*)(_from+Interpreter::local_offset_in_bytes(1)); + _from -= 2*Interpreter::stackElementSize; + + if (_next_double_dex < Argument::n_float_register_parameters_c) { + //We can allocate to a register. 
+ int index = _next_double_dex++; + *_fp_identifiers &= ~((3 << index*2) | (1 << index+16)); + _fp_args[index*2] = low_obj; + _fp_args[index*2 + 1] = high_obj; + } else { + _to = (intptr_t*)(((intptr_t)_to + 7) & ~7); // Align to eight bytes + *_to++ = low_obj; + *_to++ = high_obj; + } + } +#else + virtual void pass_float() { pass_int(); } + virtual void pass_double() { pass_long(); } +#endif // HARD_FLOAT_CC + + public: + SlowSignatureHandler(const methodHandle &method, address from, intptr_t* to) + : NativeSignatureIterator(method) + { + _from = from; + _to = to; + // See layout in interpreter_aarch32.cpp + _int_args = to - (method->is_static() ? 19 : 20); + _fp_args = to - 16; //each slot is for a double + _fp_identifiers = to - 21; + *_fp_identifiers = (1 <<(Argument::n_float_register_parameters_c * 3)) - 1; + + _num_int_reg_args = (method->is_static() ? 1 : 0); + _next_double_dex = 0; + } +}; + + +IRT_ENTRY(address, + InterpreterRuntime::slow_signature_handler(JavaThread* thread, + Method* method, + intptr_t* from, + intptr_t* to)) + methodHandle m(thread, (Method*)method); + assert(m->is_native(), "sanity check"); + + // handle arguments + SlowSignatureHandler ssh(m, (address)from, to); + ssh.iterate(UCONST64(-1)); + + // return result handler + return Interpreter::result_handler(m->result_type()); +IRT_END --- /dev/null 2018-09-25 19:25:05.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/interpreterRT_aarch32.hpp 2018-09-25 19:25:05.000000000 +0300 @@ -0,0 +1,62 @@ +/* + * Copyright (c) 1998, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_INTERPRETERRT_AARCH32_HPP +#define CPU_AARCH32_VM_INTERPRETERRT_AARCH32_HPP + +// This is included in the middle of class Interpreter. +// Do not include files here. 
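Review note: SignatureHandlerGenerator (emitting stub code) and SlowSignatureHandler (walking the signature in C++) above encode essentially the same AAPCS slotting rules twice: c_rarg0 is reserved for the JNIEnv, the next 32-bit arguments go to c_rarg1..c_rarg3, a long needs the aligned pair r2:r3 or an 8-byte-aligned stack slot, and later integer arguments fall through to the outgoing stack area. A minimal, self-contained sketch of just that slotting logic, not part of the patch (the ArgSlotter name and the string encoding of locations are invented for illustration):

#include <cstddef>
#include <string>
#include <vector>

// Hypothetical model of the integer-argument placement performed by the
// signature handlers above: r0 carries the JNIEnv, r1-r3 take the next
// 32-bit slots, a long needs the aligned pair r2:r3, and everything else
// spills to the outgoing stack area (8-byte aligned for longs).
struct ArgSlotter {
  int next_reg = 1;                    // r1 is the first free core register
  size_t stack_off = 0;                // byte offset into the stack area
  std::vector<std::string> placement;  // where each argument ended up

  void pass_int() {
    if (next_reg <= 3) {
      placement.push_back("r" + std::to_string(next_reg++));
    } else {
      placement.push_back("sp+" + std::to_string(stack_off));
      stack_off += 4;
    }
  }

  void pass_long() {
    if (next_reg <= 2) {               // r2:r3 still free as an even/odd pair
      placement.push_back("r2:r3");
      next_reg = 4;                    // later integer args go to the stack
    } else {
      stack_off = (stack_off + 7) & ~size_t(7);  // 8-byte align, as in pass_long()
      placement.push_back("sp+" + std::to_string(stack_off));
      stack_off += 8;
    }
  }
};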
+ +// native method calls + +class SignatureHandlerGenerator: public NativeSignatureIterator { + private: + MacroAssembler* _masm; + unsigned int _fp_arg_mask; + int _num_int_args; + unsigned _next_double_dex; + int _stack_offset; + + void pass_int(); + void pass_long(); + void pass_float(); + void pass_double(); + void pass_object(); + + public: + // Creation + SignatureHandlerGenerator(const methodHandle& method, CodeBuffer* buffer); + + // Code generation + void generate(uint64_t fingerprint); + + // Code generation support + static Register from(); + static Register to(); + static Register temp(); +}; + +#endif // CPU_AARCH32_VM_INTERPRETERRT_AARCH32_HPP --- /dev/null 2018-09-25 19:25:06.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/javaFrameAnchor_aarch32.hpp 2018-09-25 19:25:06.000000000 +0300 @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_JAVAFRAMEANCHOR_AARCH32_HPP +#define CPU_AARCH32_VM_JAVAFRAMEANCHOR_AARCH32_HPP + +private: + + // FP value associated with _last_Java_sp: + intptr_t* volatile _last_Java_fp; // pointer is volatile not what it points to + +public: + // Each arch must define reset, save, restore + // These are used by objects that only care about: + // 1 - initializing a new state (thread creation, javaCalls) + // 2 - saving a current state (javaCalls) + // 3 - restoring an old state (javaCalls) + + void clear(void) { + // clearing _last_Java_sp must be first + _last_Java_sp = NULL; + OrderAccess::release(); + _last_Java_fp = NULL; + _last_Java_pc = NULL; + } + + void copy(JavaFrameAnchor* src) { + // In order to make sure the transition state is valid for "this" + // We must clear _last_Java_sp before copying the rest of the new data + // + // Hack Alert: Temporary bugfix for 4717480/4721647 + // To act like previous version (pd_cache_state) don't NULL _last_Java_sp + // unless the value is changing + // + if (_last_Java_sp != src->_last_Java_sp) { + _last_Java_sp = NULL; + OrderAccess::release(); + } + _last_Java_fp = src->_last_Java_fp; + _last_Java_pc = src->_last_Java_pc; + // Must be last so profiler will always see valid frame if has_last_frame() is true + _last_Java_sp = src->_last_Java_sp; + } + + bool walkable(void) { return _last_Java_sp != NULL && _last_Java_pc != NULL; } + void make_walkable(JavaThread* thread); + void capture_last_Java_pc(void); + + intptr_t* last_Java_sp(void) const { return _last_Java_sp; } + + address last_Java_pc(void) { return _last_Java_pc; } + +private: + + static ByteSize last_Java_fp_offset() { return byte_offset_of(JavaFrameAnchor, _last_Java_fp); } + +public: + + void set_last_Java_sp(intptr_t* sp) { _last_Java_sp = sp; OrderAccess::release(); } + + intptr_t* last_Java_fp(void) { return _last_Java_fp; } + // Assert (last_Java_sp == NULL || fp == NULL) + void set_last_Java_fp(intptr_t* fp) { OrderAccess::release(); _last_Java_fp = fp; } + +#endif // CPU_AARCH32_VM_JAVAFRAMEANCHOR_AARCH32_HPP --- /dev/null 2018-09-25 19:25:07.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/jniFastGetField_aarch32.cpp 2018-09-25 19:25:07.000000000 +0300 @@ -0,0 +1,195 @@ +/* + * Copyright (c) 2004, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "gc/shared/barrierSet.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "memory/resourceArea.hpp" +#include "prims/jniFastGetField.hpp" +#include "prims/jvm_misc.hpp" +#include "runtime/safepoint.hpp" + +#define __ masm-> + +#define BUFFER_SIZE_ARMV7 31*wordSize +#define BUFFER_SIZE_ARMV6 51*wordSize + +// Instead of issuing a LoadLoad barrier we create an address +// dependency between loads; this might be more efficient. + +// Common register usage: +// r0/v0: result +// c_rarg0: jni env +// c_rarg1: obj +// c_rarg2: jfield id + +address JNI_FastGetField::generate_fast_get_int_field0(BasicType type) { + Register result = c_rarg0; + Register robj = c_rarg1; + Register rcounter = c_rarg3; + int args = RegSet::of(c_rarg0, c_rarg1, c_rarg2).bits(); + int nargs = 3; + + const char *name; + switch (type) { + case T_BOOLEAN: name = "jni_fast_GetBooleanField"; break; + case T_BYTE: name = "jni_fast_GetByteField"; break; + case T_CHAR: name = "jni_fast_GetCharField"; break; + case T_SHORT: name = "jni_fast_GetShortField"; break; + case T_INT: name = "jni_fast_GetIntField"; break; + case T_LONG: name = "jni_fast_GetLongField"; break; + case T_FLOAT: name = "jni_fast_GetFloatField"; break; + case T_DOUBLE: name = "jni_fast_GetDoubleField"; break; + default: ShouldNotReachHere(); name = ""; // unreachable + } + ResourceMark rm; + BufferBlob* blob = BufferBlob::create(name, + VM_Version::features() & FT_ARMV7 ? + BUFFER_SIZE_ARMV7 : + BUFFER_SIZE_ARMV6 ); + CodeBuffer cbuf(blob); + MacroAssembler* masm = new MacroAssembler(&cbuf); + address fast_entry = __ pc(); + + Label slow; + + __ lea(rcounter, SafepointSynchronize::safepoint_counter_addr()); + __ ldr(rcounter, rcounter); + __ tst(rcounter, 1); + __ b(slow, Assembler::NE); + __ stmdb(sp, args); + // doesn't change c_rarg1 but does force a dependency on rcounter before + // performing __ ldr(robj, ... + __ eor(robj, c_rarg1, rcounter); + __ eor(robj, robj, rcounter); + + BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->try_resolve_jobject_in_native(masm, c_rarg0, robj, noreg, slow); + + assert(count < LIST_CAPACITY, "LIST_CAPACITY too small"); + speculative_load_pclist[count] = __ pc(); // Used by the segfault handler + // c_rarg2 * 2 is offset + // Only ldr & ldrb support shifted loads + switch (type) { + case T_FLOAT: + case T_INT: __ ldr (result, Address(robj, c_rarg2, lsr(2))); break; + case T_BOOLEAN: __ ldrb(result, Address(robj, c_rarg2, lsr(2))); break; + default: { + __ lsr(c_rarg2, c_rarg2, 2); + switch(type) { + case T_BYTE: __ ldrsb (result, Address(robj, c_rarg2)); break; + case T_CHAR: __ ldrh (result, Address(robj, c_rarg2)); break; + case T_SHORT: __ ldrsh (result, Address(robj, c_rarg2)); break; + case T_DOUBLE: + case T_LONG: __ ldrd (result, Address(robj, c_rarg2)); break; + default: ShouldNotReachHere(); + } + } + } + __ lea(rscratch2, SafepointSynchronize::safepoint_counter_addr()); + // rscratch2 is address dependent on result. + // TODO Do we need to force dependency on r1 too? 
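// Review note (not part of the patch): the two eor instructions that follow
// rely on the identity (a ^ b) ^ b == a.  rscratch2 still holds the address of
// the safepoint counter afterwards, but its value is now data-dependent on the
// just-loaded field in `result`, so the ldr that re-reads the counter cannot be
// issued ahead of the field load.  Together with the matching eor pair applied
// to robj after the first counter load, this address dependency is what stands
// in for the explicit LoadLoad barrier mentioned at the top of this file.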
+ __ eor(rscratch2, rscratch2, result); + __ eor(rscratch2, rscratch2, result); + __ ldr(rscratch2, rscratch2); + __ cmp(rcounter, rscratch2); + +#ifdef HARD_FLOAT_CC + switch (type) { + case T_FLOAT: __ vmov_f32(d0, result, Assembler::EQ); break; + case T_DOUBLE: __ vmov_f64(d0, r0, r1, Assembler::EQ); break; // Change me if result changes + default: break; + } +#endif//HARD_FLOAT_CC + + __ add(sp, sp, nargs * wordSize, Assembler::EQ); // Pop args if we don't need them. + __ b(lr, Assembler::EQ); + + // Restore args for slowcase call into the vm + __ ldmia(sp, args); + + // Slowcase + slowcase_entry_pclist[count++] = __ pc(); + __ bind(slow); + + address slow_case_addr = NULL; + switch (type) { + case T_BOOLEAN: slow_case_addr = jni_GetBooleanField_addr(); break; + case T_BYTE: slow_case_addr = jni_GetByteField_addr(); break; + case T_CHAR: slow_case_addr = jni_GetCharField_addr(); break; + case T_SHORT: slow_case_addr = jni_GetShortField_addr(); break; + case T_INT: slow_case_addr = jni_GetIntField_addr(); break; + case T_LONG: slow_case_addr = jni_GetLongField_addr(); break; + case T_FLOAT: slow_case_addr = jni_GetFloatField_addr(); break; + case T_DOUBLE: slow_case_addr = jni_GetDoubleField_addr(); break; + default: ShouldNotReachHere(); + } + + { + __ enter(); + __ lea(rscratch2, ExternalAddress(slow_case_addr)); + __ bl(rscratch2); + __ maybe_isb(); + __ leave(); + __ b(lr); + } + __ flush (); + + return fast_entry; +} + +address JNI_FastGetField::generate_fast_get_boolean_field() { + return generate_fast_get_int_field0(T_BOOLEAN); +} + +address JNI_FastGetField::generate_fast_get_byte_field() { + return generate_fast_get_int_field0(T_BYTE); +} + +address JNI_FastGetField::generate_fast_get_char_field() { + return generate_fast_get_int_field0(T_CHAR); +} + +address JNI_FastGetField::generate_fast_get_short_field() { + return generate_fast_get_int_field0(T_SHORT); +} + +address JNI_FastGetField::generate_fast_get_int_field() { + return generate_fast_get_int_field0(T_INT); +} + +address JNI_FastGetField::generate_fast_get_long_field() { + return generate_fast_get_int_field0(T_LONG); +} + +address JNI_FastGetField::generate_fast_get_float_field() { + return generate_fast_get_int_field0(T_FLOAT); +} + +address JNI_FastGetField::generate_fast_get_double_field() { + return generate_fast_get_int_field0(T_DOUBLE); +} --- /dev/null 2018-09-25 19:25:08.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/jniTypes_aarch32.hpp 2018-09-25 19:25:08.000000000 +0300 @@ -0,0 +1,158 @@ +/* + * Copyright (c) 1998, 2017, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_JNITYPES_AARCH32_HPP +#define CPU_AARCH32_VM_JNITYPES_AARCH32_HPP + +#include "jni.h" +#include "memory/allocation.hpp" +#include "oops/oop.hpp" + +// This file holds platform-dependent routines used to write primitive jni +// types to the array of arguments passed into JavaCalls::call + +class JNITypes : AllStatic { + // These functions write a java primitive type (in native format) + // to a java stack slot array to be passed as an argument to JavaCalls:calls. + // I.e., they are functionally 'push' operations if they have a 'pos' + // formal parameter. Note that jlong's and jdouble's are written + // _in reverse_ of the order in which they appear in the interpreter + // stack. This is because call stubs (see stubGenerator_sparc.cpp) + // reverse the argument list constructed by JavaCallArguments (see + // javaCalls.hpp). + +public: + // Ints are stored in native format in one JavaCallArgument slot at *to. + static inline void put_int(jint from, intptr_t *to) { *(jint *)(to + 0 ) = from; } + static inline void put_int(jint from, intptr_t *to, int& pos) { *(jint *)(to + pos++) = from; } + static inline void put_int(jint *from, intptr_t *to, int& pos) { *(jint *)(to + pos++) = *from; } + + // Longs are stored in native format in one JavaCallArgument slot at + // *(to+1). + /*static inline void put_long(jlong from, intptr_t *to) { + *(jlong*) (to + 1) = from; + } + + static inline void put_long(jlong from, intptr_t *to, int& pos) { + *(jlong*) (to + 1 + pos) = from; + pos += 2; + } + + static inline void put_long(jlong *from, intptr_t *to, int& pos) { + *(jlong*) (to + 1 + pos) = *from; + pos += 2; + }*/ + static inline void put_long(jlong from, intptr_t *to) { + uint64_t val = from; + uint64_t mask = (1LL << 32) - 1; + val = (val >> 32) | ((val & mask) << 32); + *(jlong*)to = (jlong)val; + } + + static inline void put_long(jlong from, intptr_t *to, int& pos) { + uint64_t val = from; + uint64_t mask = (1LL << 32) - 1; + val = (val >> 32) | ((val & mask) << 32); + + *(jlong*) (to + pos) = (jlong)val; + pos += 2; + } + + static inline void put_long(jlong *from, intptr_t *to, int& pos) { + uint64_t val = *from; + uint64_t mask = (1LL << 32) - 1; + val = (val >> 32) | ((val & mask) << 32); + + *(jlong*) (to + pos) = (jlong)val; + pos += 2; + } + + + + // Oops are stored in native format in one JavaCallArgument slot at *to. + static inline void put_obj(oop from, intptr_t *to) { *(oop *)(to + 0 ) = from; } + static inline void put_obj(oop from, intptr_t *to, int& pos) { *(oop *)(to + pos++) = from; } + static inline void put_obj(oop *from, intptr_t *to, int& pos) { *(oop *)(to + pos++) = *from; } + + // Floats are stored in native format in one JavaCallArgument slot at *to. + static inline void put_float(jfloat from, intptr_t *to) { *(jfloat *)(to + 0 ) = from; } + static inline void put_float(jfloat from, intptr_t *to, int& pos) { *(jfloat *)(to + pos++) = from; } + static inline void put_float(jfloat *from, intptr_t *to, int& pos) { *(jfloat *)(to + pos++) = *from; } + +#undef _JNI_SLOT_OFFSET +#define _JNI_SLOT_OFFSET 1 + // Doubles are stored in native word format in one JavaCallArgument + // slot at *(to+1). 
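Aside, not part of the patch: the only transformation the put_long() variants above (and the put_double() variants that follow) apply is exchanging the two 32-bit halves of the 64-bit value before storing it into the JavaCallArguments slots, which cooperates with the argument reversal described in the comment at the top of this class. A tiny standalone check of that bit manipulation:

#include <cassert>
#include <cstdint>

// Reproduces the word swap used by put_long()/put_double(): the high and low
// 32-bit halves are exchanged before being written to the two argument slots.
static inline uint64_t swap_words(uint64_t v) {
  const uint64_t mask = (uint64_t(1) << 32) - 1;
  return (v >> 32) | ((v & mask) << 32);
}

int main() {
  assert(swap_words(0x1122334455667788ULL) == 0x5566778811223344ULL);
  assert(swap_words(swap_words(0xCAFEBABEDEADBEEFULL)) == 0xCAFEBABEDEADBEEFULL);  // involution
  return 0;
}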
+ /*static inline void put_double(jdouble from, intptr_t *to) { + *(jdouble*) (to + 1) = from; + } + + static inline void put_double(jdouble from, intptr_t *to, int& pos) { + *(jdouble*) (to + 1 + pos) = from; + pos += 2; + } + + static inline void put_double(jdouble *from, intptr_t *to, int& pos) { + *(jdouble*) (to + 1 + pos) = *from; + pos += 2; + }*/ + + static inline void put_double(jdouble from, intptr_t *to) { + uint64_t val = *(uint64_t*)&from; + uint64_t mask = (1LL << 32) - 1; + val = (val >> 32) | ((val & mask) << 32); + *(uint64_t*)to = val; + } + + static inline void put_double(jdouble from, intptr_t *to, int& pos) { + uint64_t val = *(uint64_t*)&from; + uint64_t mask = (1LL << 32) - 1; + val = (val >> 32) | ((val & mask) << 32); + *(uint64_t*) (to + pos) = val; + pos += 2; + } + + static inline void put_double(jdouble *from, intptr_t *to, int& pos) { + uint64_t val = *(uint64_t*)from; + uint64_t mask = (1LL << 32) - 1; + val = (val >> 32) | ((val & mask) << 32); + *(uint64_t*) (to + pos) = val; + pos += 2; + } + + + // The get_xxx routines, on the other hand, actually _do_ fetch + // java primitive types from the interpreter stack. + // No need to worry about alignment on Intel. + static inline jint get_int (intptr_t *from) { return *(jint *) from; } + static inline jlong get_long (intptr_t *from) { return *(jlong *) (from + _JNI_SLOT_OFFSET); } + static inline oop get_obj (intptr_t *from) { return *(oop *) from; } + static inline jfloat get_float (intptr_t *from) { return *(jfloat *) from; } + static inline jdouble get_double(intptr_t *from) { return *(jdouble *)(from + _JNI_SLOT_OFFSET); } +#undef _JNI_SLOT_OFFSET +}; + +#endif // CPU_AARCH32_VM_JNITYPES_AARCH32_HPP --- /dev/null 2018-09-25 19:25:09.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/jni_aarch32.h 2018-09-25 19:25:09.000000000 +0300 @@ -0,0 +1,59 @@ +/* + * Copyright (c) 1997, 2013, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. Oracle designates this + * particular file as subject to the "Classpath" exception as provided + * by Oracle in the LICENSE file that accompanied this code. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ */ + +#ifndef _JAVASOFT_JNI_MD_H_ +#define _JAVASOFT_JNI_MD_H_ + +#if defined(SOLARIS) || defined(LINUX) || defined(_ALLBSD_SOURCE) + + +// Note: please do not change these without also changing jni_md.h in the JDK +// repository +#ifndef __has_attribute + #define __has_attribute(x) 0 +#endif +#if (defined(__GNUC__) && ((__GNUC__ > 4) || (__GNUC__ == 4) && (__GNUC_MINOR__ > 2))) || __has_attribute(visibility) + #define JNIEXPORT __attribute__((visibility("default"))) + #define JNIIMPORT __attribute__((visibility("default"))) +#else + #define JNIEXPORT + #define JNIIMPORT +#endif + + #define JNICALL + typedef int jint; + typedef long long jlong; +#else + #define JNIEXPORT __declspec(dllexport) + #define JNIIMPORT __declspec(dllimport) + #define JNICALL __stdcall + + typedef int jint; + typedef __int64 jlong; +#endif + +typedef signed char jbyte; + +#endif /* !_JAVASOFT_JNI_MD_H_ */ --- /dev/null 2018-09-25 19:25:10.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/macroAssembler_aarch32.cpp 2018-09-25 19:25:10.000000000 +0300 @@ -0,0 +1,4941 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include + +#include "precompiled.hpp" +#include "jvm.h" +#include "asm/assembler.hpp" +#include "asm/assembler.inline.hpp" +#include "gc/shared/barrierSet.hpp" +#include "gc/shared/cardTable.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "gc/shared/cardTableBarrierSet.hpp" +#include "interpreter/interpreter.hpp" +#include "compiler/disassembler.hpp" +#include "memory/resourceArea.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/accessDecorators.hpp" +//This ifdef was introduced so a core build can be built +#ifdef COMPILER2 +#include "opto/compile.hpp" +#include "opto/node.hpp" +#endif + +#include "runtime/biasedLocking.hpp" +#include "runtime/icache.hpp" +#include "runtime/interfaceSupport.inline.hpp" +#include "runtime/jniHandles.inline.hpp" +#include "runtime/sharedRuntime.hpp" + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#define STOP(error) stop(error) +#else +#define BLOCK_COMMENT(str) block_comment(str) +#define STOP(error) block_comment(error); stop(error) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") + +// FIXME This is not a nice fix, this constant was in a compiler2 header +#define MAX_stubs_size_div2 (128 / 2) +// FIXME END + +// Note the corrections in the following three instructions for the PC. +// All literal modes that use the PC need to have the offset adjusted +// Patch any kind of instruction; there may be several instructions. +// Return the total length (in bytes) of the instructions. + +int MacroAssembler::pd_patch_instruction_size(address branch, address target) { + // Note the corrections + int instructions = 1; + long offset = target - (branch + 8); // correct for that PC = PC_this + 2 instructions + bool add = offset >= 0; + unsigned insn = *(unsigned*)branch; + int opc = Instruction_aarch32::extract(insn, 27, 24); + + if(0b1010 == opc || 0b1011 == opc) { + // Branch or branch with link + assert(0 == (offset & 3), "not aligned correctly"); + Instruction_aarch32::spatch(branch, 23, 0, offset / 4); + } else if (0b0011 == opc) { + // Movw, Movt or mov, orr, orr, orr + // patch up address load to registers (absolute address). + instructions = patch_oop(branch, target) / NativeInstruction::arm_insn_sz; + } else if (0b010 == (opc >> 1)) { + // LDR, LDRB, STR, STRB + Instruction_aarch32::patch(branch, 11, 0, uabs(offset)); + Instruction_aarch32::patch(branch, 23, 23, add); + } else if (0b000 == (opc >> 1)) { + // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD + offset = uabs(offset); + Instruction_aarch32::patch(branch, 3, 0, offset & 0xf); + Instruction_aarch32::patch(branch, 11, 8, offset >> 4); + Instruction_aarch32::patch(branch, 23, 23, add); + } else if (0b1101 == opc) { + // VLDR, VSTR - NOTE VSTR(lit) is deprecated + offset = uabs(offset); + assert(0 == (offset & 3), "vldr, vstr can't do unaligned access"); + Instruction_aarch32::patch(branch, 7, 0, offset >> 2); + Instruction_aarch32::patch(branch, 23, 23, add); + } else if (0b0010 == opc) { + // ADR + Instruction_aarch32::patch(branch, 11, 0, encode_imm12(uabs(offset))); + Instruction_aarch32::patch(branch, 23, 22, add ? 0b10 : 0b01 ); + } else { + ShouldNotReachHere(); + } + // aarch64 had something for polling page load? 
+ return instructions * NativeInstruction::arm_insn_sz; +} + +int MacroAssembler::patch_oop(address insn_addr, address o) { + unsigned insn = *(unsigned*)insn_addr; + int opc = Instruction_aarch32::extract(insn, 27, 21); + if(0b0011000 == opc) { + //32-bit pointers, formed of a mov and a movt + assert(nativeInstruction_at(insn_addr+4)->is_movt(), "wrong insns in patch"); + + uint32_t btm = (uint32_t)o & 0xffff; + Instruction_aarch32::patch(insn_addr, 19, 16, btm >> 12); + Instruction_aarch32::patch(insn_addr, 11, 0, btm & 0xfff); + uint32_t top = (uint32_t)o >> 16; + Instruction_aarch32::patch(insn_addr + 4, 19, 16, top >> 12); + Instruction_aarch32::patch(insn_addr + 4, 11, 0, top & 0xfff); + return 2 * NativeInstruction::arm_insn_sz; + } else if(0b0011101 == opc) { + //Instead 32bit load sequence uses mov, orr, orr, orr + assert(nativeInstruction_at(insn_addr+4 )->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(insn_addr+8 )->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(insn_addr+12)->is_orr(), "wrong insns in patch"); + // FIXME this could carry us outside valid memory + + uint32_t addr = (uint32_t)o; + Instruction_aarch32::patch(insn_addr + 0, 11, 0, (0b0000 << 8) | ((addr >> 0) & 0xff)); + Instruction_aarch32::patch(insn_addr + 4, 11, 0, (0b1100 << 8) | ((addr >> 8) & 0xff)); + Instruction_aarch32::patch(insn_addr + 8, 11, 0, (0b1000 << 8) | ((addr >> 16) & 0xff)); + Instruction_aarch32::patch(insn_addr + 12, 11, 0, (0b0100 << 8) | ((addr >> 24) & 0xff)); + return 4 * NativeInstruction::arm_insn_sz; + } else { + ShouldNotReachHere(); + } + return 0; //won't reach here +} + +address MacroAssembler::target_addr_for_insn(address insn_addr, unsigned insn) { + long offset = 0; + int opc = Instruction_aarch32::extract(insn, 27, 24); + + if(0b1010 == opc || 0b1011 == opc) { + // Branch or branch with link + offset = Instruction_aarch32::sextract(insn, 23, 0) * 4; + } else if (0b0011 == opc) { + unsigned *insn_buf = (unsigned*)insn_addr; + int opc2 = Instruction_aarch32::extract(insn, 23, 21); + if(0b000 == opc2) { + // movw, movt (only on newer ARMs) + assert(nativeInstruction_at(&insn_buf[1])->is_movt(), "wrong insns in patch"); + uint32_t addr; + addr = Instruction_aarch32::extract(insn_buf[1], 19, 16) << 28; + addr |= Instruction_aarch32::extract(insn_buf[1], 11, 0) << 16; + addr |= Instruction_aarch32::extract(insn_buf[0], 19, 16) << 12; + addr |= Instruction_aarch32::extract(insn_buf[0], 11, 0); + return address(addr); + } else if(0b101 == opc2) { + // mov, orr, orr, orr + assert(nativeInstruction_at(&insn_buf[1])->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(&insn_buf[2])->is_orr(), "wrong insns in patch"); + assert(nativeInstruction_at(&insn_buf[3])->is_orr(), "wrong insns in patch"); + uint32_t addr; + // TODO Check that the rotations are in the expected order. + addr = Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[0], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[1], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[2], 11, 0)); + addr |= Assembler::decode_imm12(Instruction_aarch32::extract(insn_buf[3], 11, 0)); + return address(addr); + } else { + ShouldNotReachHere(); + } + } else if (0b010 == (opc >> 1)) { + // LDR, LDRB, STR, STRB + offset = Instruction_aarch32::extract(insn, 11, 0); + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? 
offset : -offset; + } else if (0b000 == (opc >> 1)) { + // LDRH, LDRSH, LDRSB, LDRD, STRH, STRD + offset = Instruction_aarch32::extract(insn, 3, 0); + offset |= Instruction_aarch32::extract(insn, 11, 8) << 4; + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b1101 == opc) { + // VLDR, VSTR - NOTE VSTR(lit) is deprecated + offset = Instruction_aarch32::extract(insn, 7, 0) << 2; + bool add = Instruction_aarch32::extract(insn, 23, 23); + offset = add ? offset : -offset; + } else if (0b0010 == opc) { + // ADR + offset = decode_imm12(Instruction_aarch32::extract(insn, 11, 0)); + int code = Instruction_aarch32::extract(insn, 23, 22); + switch(code) { + case 0b01: offset = -offset; break; + case 0b10: break; + default: ShouldNotReachHere(); + } + } else { + ShouldNotReachHere(); + } + //Correct offset for PC + offset += 8; + return address(((uint32_t)insn_addr + offset)); +} + + +void MacroAssembler::serialize_memory(Register thread, Register tmp) { + dmb(Assembler::ISH); +} + +void MacroAssembler::safepoint_poll(Label& slow_path) { + if (SafepointMechanism::uses_thread_local_poll()) { + ldr(rscratch1, Address(rthread, Thread::polling_page_offset())); + tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); + } else { + mov(rscratch1, ExternalAddress(SafepointSynchronize::address_of_state())); + ldr(rscratch1, Address(rscratch1)); + cmp(rscratch1, SafepointSynchronize::_not_synchronized); + b(slow_path, Assembler::NE); + } +} + +// Just like safepoint_poll, but use an acquiring load for thread- +// local polling. +// +// We need an acquire here to ensure that any subsequent load of the +// global SafepointSynchronize::_state flag is ordered after this load +// of the local Thread::_polling page. We don't want this poll to +// return false (i.e. not safepointing) and a later poll of the global +// SafepointSynchronize::_state spuriously to return true. +// +// This is to avoid a race when we're in a native->Java transition +// racing the code which wakes up from a safepoint. +// +void MacroAssembler::safepoint_poll_acquire(Label& slow_path) { + if (SafepointMechanism::uses_thread_local_poll()) { + lea(rscratch1, Address(rthread, Thread::polling_page_offset())); + ldr(rscratch1, rscratch1); + dmb(Assembler::ISH); + tbnz(rscratch1, exact_log2(SafepointMechanism::poll_bit()), slow_path); + } else { + safepoint_poll(slow_path); + } +} + +void MacroAssembler::reset_last_Java_frame(bool clear_fp) { + mov(rscratch1, 0); + // we must set sp to zero to clear frame + str(rscratch1, Address(rthread, JavaThread::last_Java_sp_offset())); + // must clear fp, so that compiled frames are not confused; it is + // possible that we need it only for debugging + if (clear_fp) { + str(rscratch1, Address(rthread, JavaThread::last_Java_fp_offset())); + } + + // Always clear the pc because it could have been set by make_walkable() + str(rscratch1, Address(rthread, JavaThread::last_Java_pc_offset())); +} + +// Calls to C land +// +// When entering C land, the rfp & sp of the last Java frame have to be recorded +// in the (thread-local) JavaThread object. When leaving C land, the last Java fp +// has to be reset to 0. This is required to allow proper stack traversal. 
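+// A sketch of the intended pairing (this is how call_VM_base further down in
+// this file uses it): the anchor is recorded immediately before the call into
+// the VM and cleared again once the VM call returns.
+//
+//   Label l;
+//   set_last_Java_frame(sp, rfp, l, rscratch2);                    // record sp/fp/pc in the JavaThread
+//   call_VM_leaf_base(entry_point, number_of_arguments, &l);       // call into C land
+//   reset_last_Java_frame(true);                                   // clear sp, fp and pc again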
+void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Register last_java_pc, + Register scratch) { + + if (last_java_pc->is_valid()) { + str(last_java_pc, Address(rthread, + JavaThread::frame_anchor_offset() + + JavaFrameAnchor::last_Java_pc_offset())); + } + + // determine last_java_sp register + if (last_java_sp == sp) { + mov(scratch, sp); + last_java_sp = scratch; + } else if (!last_java_sp->is_valid()) { + last_java_sp = sp; + } + + str(last_java_sp, Address(rthread, JavaThread::last_Java_sp_offset())); + + // last_java_fp is optional + if (last_java_fp->is_valid()) { + str(last_java_fp, Address(rthread, JavaThread::last_Java_fp_offset())); + } +} + +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + address last_java_pc, + Register scratch) { + if (last_java_pc != NULL) { + adr(scratch, last_java_pc); + } else { + // FIXME: This is almost never correct. We should delete all + // cases of set_last_Java_frame with last_java_pc=NULL and use the + // correct return address instead. + adr(scratch, pc()); + } + + str(scratch, Address(rthread, + JavaThread::frame_anchor_offset() + + JavaFrameAnchor::last_Java_pc_offset())); + + set_last_Java_frame(last_java_sp, last_java_fp, noreg, scratch); +} + +void MacroAssembler::set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Label &L, + Register scratch) { + if (L.is_bound()) { + set_last_Java_frame(last_java_sp, last_java_fp, target(L), scratch); + } else { + InstructionMark im(this); + L.add_patch_at(code(), locator()); + set_last_Java_frame(last_java_sp, last_java_fp, (address)NULL, scratch); + } +} + +void MacroAssembler::far_call(Address entry, CodeBuffer *cbuf) { + assert(CodeCache::find_blob(entry.target()) != NULL, + "destination of far call not found in code cache"); + if (far_branches()) { + lea(lr, entry); + if (cbuf) cbuf->set_insts_mark(); + bl(lr); + } else { + if (cbuf) cbuf->set_insts_mark(); + bl(entry); + } +} + +void MacroAssembler::far_jump(Address entry, CodeBuffer *cbuf, Register tmp) { + assert(CodeCache::find_blob(entry.target()) != NULL, + "destination of far call not found in code cache"); + if (far_branches()) { + lea(tmp, entry); + if (cbuf) cbuf->set_insts_mark(); + b(tmp); + } else { + if (cbuf) cbuf->set_insts_mark(); + b(entry); + } +} + +void MacroAssembler::reserved_stack_check() { + // testing if reserved zone needs to be enabled + Label no_reserved_zone_enabling; + + ldr(rscratch1, Address(rthread, JavaThread::reserved_stack_activation_offset())); + cmp(sp, rscratch1); + b(no_reserved_zone_enabling, Assembler::LO); + + enter(); // LR and FP are live. + lea(rscratch1, CAST_FROM_FN_PTR(address, SharedRuntime::enable_stack_reserved_zone)); + mov(c_rarg0, rthread); + bl(rscratch1); + leave(); + + // We have already removed our own frame. + // throw_delayed_StackOverflowError will think that it's been + // called by our caller. 
+ lea(rscratch1, RuntimeAddress(StubRoutines::throw_delayed_StackOverflowError_entry())); + b(rscratch1); + should_not_reach_here(); + + bind(no_reserved_zone_enabling); +} + +int MacroAssembler::biased_locking_enter(Register obj_reg, + Register swap_reg, + Register tmp_reg, + Register tmp_reg2, + bool swap_reg_contains_mark, + Label& done, + Label* slow_case, + BiasedLockingCounters* counters) { + assert(UseBiasedLocking, "why call this otherwise?"); + + if (PrintBiasedLockingStatistics && counters == NULL) + counters = BiasedLocking::counters(); + + assert(tmp_reg != noreg, "must be real register"); + assert_different_registers(obj_reg, swap_reg, tmp_reg, tmp_reg2); + assert(markOopDesc::age_shift == markOopDesc::lock_bits + markOopDesc::biased_lock_bits, "biased locking makes assumptions about bit layout"); + Address mark_addr (obj_reg, oopDesc::mark_offset_in_bytes()); + Address klass_addr (obj_reg, oopDesc::klass_offset_in_bytes()); + + // Biased locking + // See whether the lock is currently biased toward our thread and + // whether the epoch is still valid + // Note that the runtime guarantees sufficient alignment of JavaThread + // pointers to allow age to be placed into low bits + // First check to see whether biasing is even enabled for this object + Label cas_label; + int null_check_offset = -1; + if (!swap_reg_contains_mark) { + null_check_offset = offset(); + ldr(swap_reg, mark_addr); + } + andr(tmp_reg, swap_reg, markOopDesc::biased_lock_mask_in_place); + cmp(tmp_reg, markOopDesc::biased_lock_pattern); + b(cas_label, Assembler::NE); + // The bias pattern is present in the object's header. Need to check + // whether the bias owner and the epoch are both still current. + load_prototype_header(tmp_reg, obj_reg); + orr(tmp_reg, tmp_reg, rthread); + eor(tmp_reg, swap_reg, tmp_reg); +// andr(tmp_reg, tmp_reg, ~((int) markOopDesc::age_mask_in_place)); + bic(tmp_reg, tmp_reg, markOopDesc::age_mask_in_place); + if (counters != NULL) { + Label around; + cbnz(tmp_reg, around); + atomic_inc(Address((address)counters->biased_lock_entry_count_addr()), tmp_reg, tmp_reg2); + b(done); + bind(around); + } else { + cbz(tmp_reg, done); + } + + Label try_revoke_bias; + Label try_rebias; + + // At this point we know that the header has the bias pattern and + // that we are not the bias owner in the current epoch. We need to + // figure out more details about the state of the header in order to + // know what operations can be legally performed on the object's + // header. + + // If the low three bits in the xor result aren't clear, that means + // the prototype header is no longer biased and we have to revoke + // the bias on this object. + andr(tmp_reg2, tmp_reg, markOopDesc::biased_lock_mask_in_place); + cbnz(tmp_reg2, try_revoke_bias); + + // Biasing is still enabled for this data type. See whether the + // epoch of the current bias is still valid, meaning that the epoch + // bits of the mark word are equal to the epoch bits of the + // prototype header. (Note that the prototype header's epoch bits + // only change at a safepoint.) If not, attempt to rebias the object + // toward the current thread. Note that we must be absolutely sure + // that the current epoch is invalid in order to do this because + // otherwise the manipulations it performs on the mark word are + // illegal. 
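+  // For reference, the 32-bit mark word layout these bit tests rely on is
+  // roughly (see markOop.hpp):
+  //   [ JavaThread*:23 | epoch:2 | age:4 | biased_lock:1 | lock:2 ]   (biased)
+  //   [ hash:25                  | age:4 | biased_lock:1 | lock:2 ]   (unbiased)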
+ andr(tmp_reg2, tmp_reg, markOopDesc::epoch_mask_in_place); + cbnz(tmp_reg2, try_rebias); + + // The epoch of the current bias is still valid but we know nothing + // about the owner; it might be set or it might be clear. Try to + // acquire the bias of the object using an atomic operation. If this + // fails we will go in to the runtime to revoke the object's bias. + // Note that we first construct the presumed unbiased header so we + // don't accidentally blow away another thread's valid bias. + { + Label here; + mov(tmp_reg2, markOopDesc::biased_lock_mask_in_place | markOopDesc::age_mask_in_place | markOopDesc::epoch_mask_in_place); + andr(swap_reg, swap_reg, tmp_reg2); + orr(tmp_reg, swap_reg, rthread); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case); + // If the biasing toward our thread failed, this means that + // another thread succeeded in biasing it toward itself and we + // need to revoke that bias. The revocation will occur in the + // interpreter runtime in the slow case. + bind(here); + if (counters != NULL) { + atomic_inc(Address((address)counters->anonymously_biased_lock_entry_count_addr()), + tmp_reg, tmp_reg2); + } + } + b(done); + + bind(try_rebias); + // At this point we know the epoch has expired, meaning that the + // current "bias owner", if any, is actually invalid. Under these + // circumstances _only_, we are allowed to use the current header's + // value as the comparison value when doing the cas to acquire the + // bias in the current epoch. In other words, we allow transfer of + // the bias from one thread to another directly in this situation. + // + // FIXME: due to a lack of registers we currently blow away the age + // bits in this situation. Should attempt to preserve them. + { + Label here; + load_prototype_header(tmp_reg, obj_reg); + orr(tmp_reg, rthread, tmp_reg); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, slow_case); + // If the biasing toward our thread failed, then another thread + // succeeded in biasing it toward itself and we need to revoke that + // bias. The revocation will occur in the runtime in the slow case. + bind(here); + if (counters != NULL) { + atomic_inc(Address((address)counters->rebiased_lock_entry_count_addr()), + tmp_reg, tmp_reg2); + } + } + b(done); + + bind(try_revoke_bias); + // The prototype mark in the klass doesn't have the bias bit set any + // more, indicating that objects of this data type are not supposed + // to be biased any more. We are going to try to reset the mark of + // this object to the prototype value and fall through to the + // CAS-based locking scheme. Note that if our CAS fails, it means + // that another thread raced us for the privilege of revoking the + // bias of this particular object, so it's okay to continue in the + // normal locking code. + // + // FIXME: due to a lack of registers we currently blow away the age + // bits in this situation. Should attempt to preserve them. + { + Label here, nope; + load_prototype_header(tmp_reg, obj_reg); + cmpxchg_obj_header(swap_reg, tmp_reg, obj_reg, tmp_reg2, here, &nope); + bind(here); + + // Fall through to the normal CAS-based lock, because no matter what + // the result of the above CAS, some thread must have succeeded in + // removing the bias bit from the object's header. 
+ if (counters != NULL) { + atomic_inc(Address((address)counters->revoked_lock_entry_count_addr()), tmp_reg, + tmp_reg2); + } + bind(nope); + } + + bind(cas_label); + + return null_check_offset; +} + +void MacroAssembler::biased_locking_exit(Register obj_reg, Register temp_reg, Label& done) { + assert(UseBiasedLocking, "why call this otherwise?"); + + // Check for biased locking unlock case, which is a no-op + // Note: we do not have to check the thread ID for two reasons. + // First, the interpreter checks for IllegalMonitorStateException at + // a higher level. Second, if the bias was revoked while we held the + // lock, the object could not be rebiased toward another thread, so + // the bias bit would be clear. + ldr(temp_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); + andr(temp_reg, temp_reg, markOopDesc::biased_lock_mask_in_place); + cmp(temp_reg, markOopDesc::biased_lock_pattern); + b(done, Assembler::EQ); +} + + +static void pass_arg0(MacroAssembler* masm, Register arg) { + if (c_rarg0 != arg ) { + masm->mov(c_rarg0, arg); + } +} + +static void pass_arg1(MacroAssembler* masm, Register arg) { + if (c_rarg1 != arg ) { + masm->mov(c_rarg1, arg); + } +} + +static void pass_arg2(MacroAssembler* masm, Register arg) { + if (c_rarg2 != arg ) { + masm->mov(c_rarg2, arg); + } +} + +static void pass_arg3(MacroAssembler* masm, Register arg) { + if (c_rarg3 != arg ) { + masm->mov(c_rarg3, arg); + } +} + +void MacroAssembler::call_VM_base(Register oop_result, + Register java_thread, + Register last_java_sp, + address entry_point, + int number_of_arguments, + bool check_exceptions) { + // determine java_thread register + if (!java_thread->is_valid()) { + java_thread = rthread; + } + + // determine last_java_sp register + if (!last_java_sp->is_valid()) { + last_java_sp = sp; + } + + // debugging support + assert(number_of_arguments >= 0 , "cannot have negative number of arguments"); + assert(java_thread == rthread, "unexpected register"); + + assert(java_thread != oop_result , "cannot use the same register for java_thread & oop_result"); + assert(java_thread != last_java_sp, "cannot use the same register for java_thread & last_java_sp"); + + // push java thread (becomes first argument of C function) + + mov(c_rarg0, java_thread); + + // set last Java frame before call + assert(last_java_sp != rfp, "can't use rfp"); + + Label l; + set_last_Java_frame(last_java_sp, rfp, l, rscratch2); + + + // FIXME - Can save lr in more elegant way ? 
+  //str(lr, pre(sp, -wordSize));
+
+  // do the call, remove parameters
+  MacroAssembler::call_VM_leaf_base(entry_point, number_of_arguments, &l);
+
+  //ldr(lr, post(sp, wordSize));
+
+  // reset last Java frame
+  // Only interpreter should have to clear fp
+  reset_last_Java_frame(true);
+
+  // C++ interp handles this in the interpreter
+  check_and_handle_popframe(java_thread);
+  check_and_handle_earlyret(java_thread);
+
+  if (check_exceptions) {
+    // check for pending exceptions (java_thread is set upon return)
+    ldr(rscratch2, Address(java_thread, in_bytes(Thread::pending_exception_offset())));
+    Label ok;
+    cbz(rscratch2, ok);
+
+    lea(rscratch2, RuntimeAddress(StubRoutines::forward_exception_entry()));
+    // forward_exception uses LR to choose the exception handler, but LR was trashed by the code above.
+    // Since we got here from interpreted code, BL is an acceptable way to acquire the correct LR
+    // (see StubGenerator::generate_forward_exception).
+    bl(rscratch2);
+    bind(ok);
+  }
+
+  // get oop result if there is one and reset the value in the thread
+  if (oop_result->is_valid()) {
+    get_vm_result(oop_result, java_thread);
+  }
+}
+
+void MacroAssembler::call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions) {
+  call_VM_base(oop_result, noreg, noreg, entry_point, number_of_arguments, check_exceptions);
+}
+
+// Maybe emit a call via a trampoline.  If the code cache is small
+// trampolines won't be emitted.
+
+void MacroAssembler::trampoline_call(Address entry, CodeBuffer *cbuf) {
+  assert(JavaThread::current()->is_Compiler_thread(), "just checking");
+  assert(entry.rspec().type() == relocInfo::runtime_call_type
+         || entry.rspec().type() == relocInfo::opt_virtual_call_type
+         || entry.rspec().type() == relocInfo::static_call_type
+         || entry.rspec().type() == relocInfo::virtual_call_type, "wrong reloc type");
+
+  if (cbuf) {
+    cbuf->set_insts_mark();
+  }
+
+  if (far_branches()) {
+    // Build the trampoline so that the destination address is a raw 4-byte value;
+    // that way it can be patched atomically.
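+    // Schematically, the sequence emitted below is:
+    //   add  lr, pc, #imm      ; return address = first instruction past this NativeCall
+    //   ldr  pc, [pc, #imm]    ; branch through the raw destination word
+    //   .word entry.target()   ; destination literal, patchable with a single store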
+ relocate(entry.rspec()); + address start = pc(); + add(lr, r15_pc, NativeCall::instruction_size - 2 * NativeInstruction::arm_insn_sz); + ldr(r15_pc, Address(r15_pc, 4)); + emit_int32((uintptr_t) entry.target()); + // possibly pad the call to the NativeCall size to make patching happy + while (pc() - start < NativeCall::instruction_size) { + nop(); + } + assert(pc() - start == NativeCall::instruction_size, "fix NativeTrampolineCall::instruction_size!"); + } else { + bl(entry); + } +} + +void MacroAssembler::c2bool(Register x) { + ands(r0, r0, 0xff); + mov(r0, 1, Assembler::NE); +} + +void MacroAssembler::ic_call(address entry, jint method_index) { + RelocationHolder rh = virtual_call_Relocation::spec(pc(), method_index); + // address const_ptr = long_constant((jlong)Universe::non_oop_word()); + // unsigned long offset; + // ldr_constant(rscratch2, const_ptr); + movptr(rscratch2, (uintptr_t)Universe::non_oop_word()); + trampoline_call(Address(entry, rh)); +} + +// Implementation of call_VM versions + +void MacroAssembler::call_VM(Register oop_result, + address entry_point, + bool check_exceptions) { + call_VM_helper(oop_result, entry_point, 0, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + address entry_point, + Register arg_1, + bool check_exceptions) { + pass_arg1(this, arg_1); + call_VM_helper(oop_result, entry_point, 1, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + address entry_point, + Register arg_1, + Register arg_2, + bool check_exceptions) { + assert(arg_1 != c_rarg2, "smashed arg"); + pass_arg2(this, arg_2); + pass_arg1(this, arg_1); + call_VM_helper(oop_result, entry_point, 2, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + address entry_point, + Register arg_1, + Register arg_2, + Register arg_3, + bool check_exceptions) { + assert(arg_1 != c_rarg3, "smashed arg"); + assert(arg_2 != c_rarg3, "smashed arg"); + pass_arg3(this, arg_3); + + assert(arg_1 != c_rarg2, "smashed arg"); + pass_arg2(this, arg_2); + + pass_arg1(this, arg_1); + call_VM_helper(oop_result, entry_point, 3, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + int number_of_arguments, + bool check_exceptions) { + call_VM_base(oop_result, rthread, last_java_sp, entry_point, number_of_arguments, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + Register arg_1, + bool check_exceptions) { + pass_arg1(this, arg_1); + call_VM(oop_result, last_java_sp, entry_point, 1, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + Register arg_1, + Register arg_2, + bool check_exceptions) { + + assert(arg_1 != c_rarg2, "smashed arg"); + pass_arg2(this, arg_2); + pass_arg1(this, arg_1); + call_VM(oop_result, last_java_sp, entry_point, 2, check_exceptions); +} + +void MacroAssembler::call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + Register arg_1, + Register arg_2, + Register arg_3, + bool check_exceptions) { + assert(arg_1 != c_rarg3, "smashed arg"); + assert(arg_2 != c_rarg3, "smashed arg"); + pass_arg3(this, arg_3); + assert(arg_1 != c_rarg2, "smashed arg"); + pass_arg2(this, arg_2); + pass_arg1(this, arg_1); + call_VM(oop_result, last_java_sp, entry_point, 3, check_exceptions); +} + + +void MacroAssembler::get_vm_result(Register oop_result, Register java_thread) { + ldr(oop_result, 
 Address(java_thread, JavaThread::vm_result_offset()));
+  assert(oop_result != rscratch2, "can't be");
+  mov(rscratch2, 0);
+  str(rscratch2, Address(java_thread, JavaThread::vm_result_offset()));
+  verify_oop(oop_result, "broken oop in call_VM_base");
+}
+
+void MacroAssembler::get_vm_result_2(Register metadata_result, Register java_thread) {
+  ldr(metadata_result, Address(java_thread, JavaThread::vm_result_2_offset()));
+  assert(metadata_result != rscratch2 &&
+         java_thread != rscratch2, "can't be");
+  mov(rscratch2, 0);
+  str(rscratch2, Address(java_thread, JavaThread::vm_result_2_offset()));
+}
+
+void MacroAssembler::align(int modulus) {
+  while (offset() % modulus != 0) nop();
+}
+
+// these are no-ops overridden by InterpreterMacroAssembler
+
+void MacroAssembler::check_and_handle_earlyret(Register java_thread) { }
+
+void MacroAssembler::check_and_handle_popframe(Register java_thread) { }
+
+
+RegisterOrConstant MacroAssembler::delayed_value_impl(intptr_t* delayed_value_addr,
+                                                      Register tmp,
+                                                      int offset) {
+  intptr_t value = *delayed_value_addr;
+  if (value != 0)
+    return RegisterOrConstant(value + offset);
+
+  // load indirectly to solve generation ordering problem
+  ldr(tmp, ExternalAddress((address) delayed_value_addr));
+
+  if (offset != 0)
+    add(tmp, tmp, offset);
+
+  return RegisterOrConstant(tmp);
+}
+
+
+// Look up the method for a megamorphic invokeinterface call.
+// The target method is determined by <intf_klass, itable_index>.
+// The receiver klass is in recv_klass.
+// On success, the result will be in method_result, and execution falls through.
+// On failure, execution transfers to the given label.
+void MacroAssembler::lookup_interface_method(Register recv_klass,
+                                             Register intf_klass,
+                                             RegisterOrConstant itable_index,
+                                             Register method_result,
+                                             Register scan_temp,
+                                             Label& L_no_such_interface,
+                                             bool return_method) {
+  assert_different_registers(recv_klass, intf_klass, scan_temp);
+  assert_different_registers(method_result, intf_klass, scan_temp);
+  assert(recv_klass != method_result || !return_method,
+         "recv_klass can be destroyed when method isn't needed");
+
+  // Compute start of first itableOffsetEntry (which is at the end of the vtable)
+  int vtable_base = in_bytes(InstanceKlass::vtable_start_offset());
+  int itentry_off = itableMethodEntry::method_offset_in_bytes();
+  int scan_step = itableOffsetEntry::size() * wordSize;
+  int vte_size = vtableEntry::size_in_bytes();
+  assert(vte_size == wordSize, "else adjust times_vte_scale");
+
+  ldr(scan_temp, Address(recv_klass, in_bytes(InstanceKlass::vtable_length_offset())));
+
+  // %%% Could store the aligned, prescaled offset in the klassoop.
+  // lea(scan_temp, Address(recv_klass, scan_temp, times_vte_scale, vtable_base));
+  lea(scan_temp, Address(recv_klass, scan_temp, lsl(2)));
+  add(scan_temp, scan_temp, vtable_base);
+
+  if (return_method) {
+    // Adjust recv_klass by scaled itable_index, so we can free itable_index.
+    assert(itableMethodEntry::size() * wordSize == wordSize, "adjust the scaling in the code below");
+    // lea(recv_klass, Address(recv_klass, itable_index, Address::times_ptr, itentry_off));
+    lea(recv_klass, itable_index.is_register() ?
+ Address(recv_klass, itable_index, lsl(2)) : + Address(recv_klass, itable_index.as_constant() << 2)); + if (itentry_off) + add(recv_klass, recv_klass, itentry_off); + } + + // for (scan = klass->itable(); scan->interface() != NULL; scan += scan_step) { + // if (scan->interface() == intf) { + // result = (klass + scan->offset() + itable_index); + // } + // } + Label search, found_method; + + for (int peel = 1; peel >= 0; peel--) { + ldr(method_result, Address(scan_temp, itableOffsetEntry::interface_offset_in_bytes())); + cmp(intf_klass, method_result); + + if (peel) { + b(found_method, Assembler::EQ); + } else { + b(search, Assembler::NE); + // (invert the test to fall through to found_method...) + } + + if (!peel) break; + + bind(search); + + // Check that the previous entry is non-null. A null entry means that + // the receiver class doesn't implement the interface, and wasn't the + // same as when the caller was compiled. + cbz(method_result, L_no_such_interface); + add(scan_temp, scan_temp, scan_step); + } + + bind(found_method); + + if (return_method) { + // Got a hit. + ldr(scan_temp, Address(scan_temp, itableOffsetEntry::offset_offset_in_bytes())); + ldr(method_result, Address(recv_klass, scan_temp)); + } +} + +// virtual method calling +void MacroAssembler::lookup_virtual_method(Register recv_klass, + RegisterOrConstant vtable_index, + Register method_result) { + const int base = in_bytes(InstanceKlass::vtable_start_offset()); + int vtable_offset_in_bytes = base + vtableEntry::method_offset_in_bytes(); + if (vtable_index.is_register()) { + lea(method_result, Address(recv_klass, + vtable_index.as_register(), + lsl(LogBytesPerWord))); + ldr(method_result, Address(method_result, vtable_offset_in_bytes)); + } else { + vtable_offset_in_bytes += vtable_index.as_constant() * wordSize; + if(is_valid_for_offset_imm(vtable_offset_in_bytes, 12)) { + ldr(method_result, Address(recv_klass, vtable_offset_in_bytes)); + } else { + mov(method_result, vtable_offset_in_bytes); + ldr(method_result, Address(recv_klass, method_result)); + } + } +} + +void MacroAssembler::check_klass_subtype(Register sub_klass, + Register super_klass, + Register temp_reg, + Label& L_success) { + Label L_failure; + check_klass_subtype_fast_path(sub_klass, super_klass, temp_reg, &L_success, &L_failure, NULL); + check_klass_subtype_slow_path(sub_klass, super_klass, temp_reg, noreg, &L_success, NULL); + bind(L_failure); +} + + +void MacroAssembler::check_klass_subtype_fast_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Label* L_success, + Label* L_failure, + Label* L_slow_path, + RegisterOrConstant super_check_offset) { + assert_different_registers(sub_klass, super_klass, temp_reg); + bool must_load_sco = (super_check_offset.constant_or_zero() == -1); + if (super_check_offset.is_register()) { + assert_different_registers(sub_klass, super_klass, + super_check_offset.as_register()); + } else if (must_load_sco) { + assert(temp_reg != noreg, "supply either a temp or a register offset"); + } + + Label L_fallthrough; + int label_nulls = 0; + if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } + if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } + if (L_slow_path == NULL) { L_slow_path = &L_fallthrough; label_nulls++; } + assert(label_nulls <= 1, "at most one NULL in the batch"); + + int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); + int sco_offset = in_bytes(Klass::super_check_offset_offset()); + Address super_check_offset_addr(super_klass, sco_offset); 
+ + // Hacked jmp, which may only be used just before L_fallthrough. +#define final_jmp(label) \ + if (&(label) == &L_fallthrough) { /*do nothing*/ } \ + else b(label) /*omit semi*/ + + // If the pointers are equal, we are done (e.g., String[] elements). + // This self-check enables sharing of secondary supertype arrays among + // non-primary types such as array-of-interface. Otherwise, each such + // type would need its own customized SSA. + // We move this check to the front of the fast path because many + // type checks are in fact trivially successful in this manner, + // so we get a nicely predicted branch right at the start of the check. + cmp(sub_klass, super_klass); + b(*L_success, Assembler::EQ); + + // Check the supertype display: + if (must_load_sco) { + ldr(temp_reg, super_check_offset_addr); + super_check_offset = RegisterOrConstant(temp_reg); + } + Address super_check_addr(sub_klass, super_check_offset); + ldr(rscratch1, super_check_addr); + cmp(super_klass, rscratch1); // load displayed supertype + + // This check has worked decisively for primary supers. + // Secondary supers are sought in the super_cache ('super_cache_addr'). + // (Secondary supers are interfaces and very deeply nested subtypes.) + // This works in the same check above because of a tricky aliasing + // between the super_cache and the primary super display elements. + // (The 'super_check_addr' can address either, as the case requires.) + // Note that the cache is updated below if it does not help us find + // what we need immediately. + // So if it was a primary super, we can just fail immediately. + // Otherwise, it's the slow path for us (no success at this point). + + if (super_check_offset.is_register()) { + b(*L_success, Assembler::EQ); + cmp(super_check_offset.as_register(), sc_offset); + if (L_failure == &L_fallthrough) { + b(*L_slow_path, Assembler::EQ); + } else { + b(*L_failure, Assembler::NE); + final_jmp(*L_slow_path); + } + } else if (super_check_offset.as_constant() == sc_offset) { + // Need a slow path; fast failure is impossible. + if (L_slow_path == &L_fallthrough) { + b(*L_success, Assembler::EQ); + } else { + b(*L_slow_path, Assembler::NE); + final_jmp(*L_success); + } + } else { + // No slow path; it's a fast decision. + if (L_failure == &L_fallthrough) { + b(*L_success, Assembler::EQ); + } else { + b(*L_failure, Assembler::NE); + final_jmp(*L_success); + } + } + + bind(L_fallthrough); + +#undef final_jmp +} + +// These two are taken from x86, but they look generally useful + +// scans count pointer sized words at [addr] for occurence of value, +// generic +void MacroAssembler::repne_scan(Register addr, Register value, Register count, + Register scratch) { + Label loop, fail, found; + cmp(count, 0); + b(fail, EQ); + + bind(loop); + ldr(scratch, post(addr, wordSize)); + cmp(value, scratch); + b(found, EQ); + subs(count, count, 1); + b(loop, NE); + + bind(fail); + cmp(sp, 0); // sp never zero + bind(found); +} + +// Form an address from base + offset in Rd. Rd may or may +// not actually be used: you must use the Address that is returned. +// It is up to you to ensure that the shift provided matches the size +// of your data. 
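+// Illustrative use (register names and the offset here are arbitrary):
+//   Address a = form_address(rscratch2, r1, 0x12340, 2);  // offset too large for a single ldr immediate
+//   ldr(r0, a);  // 'a' is either (r1, imm) or (rscratch2, small imm) - always use the returned Address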
+Address MacroAssembler::form_address(Register Rd, Register base, long byte_offset, int shift) { + // form_address result should only be used together with ldr/str instructions + // otherwise please provide exact type instead of IDT_INT or apply safe_for() + if (Address::offset_ok_for_immed(byte_offset, Address::IDT_INT)) + // It fits; no need for any heroics + return Address(base, byte_offset); + + // See if we can do this with two 12-bit offsets + { + unsigned long masked_offset = byte_offset & ~0xfff; + if (Address::offset_ok_for_immed(byte_offset - masked_offset, Address::IDT_INT) + && Assembler::operand_valid_for_add_sub_immediate(masked_offset)) { + add(Rd, base, masked_offset); + byte_offset -= masked_offset; + return Address(Rd, byte_offset); + } + } + + // Do it the hard way + mov(Rd, byte_offset); + add(Rd, base, Rd); + return Address(Rd); +} + +// scans count 4 byte words at [addr] for occurence of value, +// generic +/*void MacroAssembler::repne_scanw(Register addr, Register value, Register count, + Register scratch) { + Label Lloop, Lexit; + cbz(count, Lexit); + bind(Lloop); + ldr(scratch, post(addr, wordSize)); + cmp(value, scratch); + b(Lexit, EQ); + sub(count, count, 1); + cbnz(count, Lloop); + bind(Lexit); +}*/ + +void MacroAssembler::check_klass_subtype_slow_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Register temp2_reg, + Label* L_success, + Label* L_failure, + bool set_cond_codes) { + assert_different_registers(sub_klass, super_klass, temp_reg); + if (temp2_reg != noreg) + assert_different_registers(sub_klass, super_klass, temp_reg, temp2_reg, rscratch1); +#define IS_A_TEMP(reg) ((reg) == temp_reg || (reg) == temp2_reg) + + Label L_fallthrough; + int label_nulls = 0; + if (L_success == NULL) { L_success = &L_fallthrough; label_nulls++; } + if (L_failure == NULL) { L_failure = &L_fallthrough; label_nulls++; } + assert(label_nulls <= 1, "at most one NULL in the batch"); + + // a couple of useful fields in sub_klass: + int ss_offset = in_bytes(Klass::secondary_supers_offset()); + int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); + Address secondary_supers_addr(sub_klass, ss_offset); + Address super_cache_addr( sub_klass, sc_offset); + + BLOCK_COMMENT("check_klass_subtype_slow_path"); + + // Do a linear scan of the secondary super-klass chain. + // This code is rarely used, so simplicity is a virtue here. + // The repne_scan instruction uses fixed registers, which we must spill. + // Don't worry too much about pre-existing connections with the input regs. + + assert(sub_klass != r0, "killed reg"); // killed by mov(r0, super) + assert(sub_klass != r2, "killed reg"); // killed by lea(r2, &pst_counter) + + RegSet pushed_registers; + if (!IS_A_TEMP(r2)) pushed_registers += r2; + if (!IS_A_TEMP(r14)) pushed_registers += r14; + + if (super_klass != r0) { + if (!IS_A_TEMP(r0)) pushed_registers += r0; + } + + push(pushed_registers, sp); + + // Get super_klass value into r0 (even if it was in r5 or r2). + if (super_klass != r0) { + mov(r0, super_klass); + } + +#ifndef PRODUCT + mov(rscratch2, (address)&SharedRuntime::_partial_subtype_ctr); + Address pst_counter_addr(rscratch2); + ldr(rscratch1, pst_counter_addr); + add(rscratch1, rscratch1, 1); + str(rscratch1, pst_counter_addr); +#endif //PRODUCT + + // We will consult the secondary-super array. + ldr(r14, secondary_supers_addr); + // Load the array length. + ldr(r2, Address(r14, Array::length_offset_in_bytes())); + // Skip to start of data. 
+ add(r14, r14, Array::base_offset_in_bytes()); + + cmp(sp, 0); // Clear Z flag; SP is never zero + // Scan R2 words at [R14] for an occurrence of R0. + // Set NZ/Z based on last compare. + repne_scan(r14, r0, r2, rscratch1); + + // Unspill the temp. registers: + pop(pushed_registers, sp); + + b(*L_failure, Assembler::NE); + + // Success. Cache the super we found and proceed in triumph. + str(super_klass, super_cache_addr); + + if (L_success != &L_fallthrough) { + b(*L_success); + } + +#undef IS_A_TEMP + + bind(L_fallthrough); +} + + +void MacroAssembler::verify_oop(Register reg, const char* s) { + if (!VerifyOops) return; + + // Pass register number to verify_oop_subroutine + const char* b = NULL; + { + ResourceMark rm; + stringStream ss; + ss.print("verify_oop: %s: %s", reg->name(), s); + b = code_string(ss.as_string()); + } + BLOCK_COMMENT("verify_oop {"); + + stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits()); + + mov(r0, reg); + mov(rscratch1, (address)b); + mrs(r1); + + // call indirectly to solve generation ordering problem + reg_printf("Verify oop entry, sp = %p, rfp = %p\n", sp, rfp); + lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); + ldr(rscratch2, Address(rscratch2)); + bl(rscratch2); + reg_printf("Verify oop exit, sp = %p, rfp = %p\n", sp, rfp); + + msr(r1); + ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits()); + + BLOCK_COMMENT("} verify_oop"); +} + +void MacroAssembler::verify_oop_addr(Address addr, const char* s) { + if (!VerifyOops) return; + + const char* b = NULL; + { + ResourceMark rm; + stringStream ss; + ss.print("verify_oop_addr: %s", s); + b = code_string(ss.as_string()); + } + BLOCK_COMMENT("verify_oop_addr {"); + + stmdb(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits()); + mrs(r1); + + // addr may contain sp so we will have to adjust it based on the + // pushes that we just did. + if (addr.uses(sp)) { + lea(r0, addr); + ldr(r0, Address(r0, 5 * wordSize)); + } else { + ldr(r0, addr); + } + mov(rscratch1, (address)b); + + // call indirectly to solve generation ordering problem + lea(rscratch2, ExternalAddress(StubRoutines::verify_oop_subroutine_entry_address())); + ldr(rscratch2, Address(rscratch2)); + bl(rscratch2); + + msr(r1); + ldmia(sp, RegSet::of(r0, r1, rscratch1, rscratch2, lr).bits()); + + BLOCK_COMMENT("} verify_oop_addr"); +} + +Address MacroAssembler::argument_address(RegisterOrConstant arg_slot, + int extra_slot_offset) { + // cf. TemplateTable::prepare_invoke(), if (load_receiver). + int stackElementSize = Interpreter::stackElementSize; + int offset = Interpreter::expr_offset_in_bytes(extra_slot_offset+0); +#ifdef ASSERT + int offset1 = Interpreter::expr_offset_in_bytes(extra_slot_offset+1); + assert(offset1 - offset == stackElementSize, "correct arithmetic"); +#endif + if (arg_slot.is_constant()) { + return Address(sp, arg_slot.as_constant() * stackElementSize + + offset); + } else { + add(rscratch1, sp, arg_slot.as_register(), + lsl(exact_log2(stackElementSize))); + return Address(rscratch1, offset); + } +} + +void MacroAssembler::call_VM_leaf_base(address entry_point, + int number_of_arguments, + Label *retaddr) { + Label E, L; + + //FIXME Do this alignment in a more elegant way + mov(rscratch2, sp); + sub(sp, sp, wordSize); + bic(sp, sp, 2 * wordSize - 1); // Align to eight bytes + str(rscratch2, Address(sp)); + + // FIXME Do we need to preserve rscratch2? 
+ //str(rscratch2, Address(pre(sp, -wordSize))); + + mov(rscratch2, entry_point); + reg_printf("\tJust about to call into the VM, rfp = %p\n", rfp); + bl(rscratch2); + if (retaddr) + bind(*retaddr); + reg_printf("\tReturned from call into the VM, rfp = %p\n", rfp); + + //ldr(rscratch2, Address(post(sp, wordSize))); + + //Undo alignment + ldr(sp, Address(sp)); + + maybe_isb(); +} + +void MacroAssembler::call_VM_leaf(address entry_point, int number_of_arguments) { + call_VM_leaf_base(entry_point, number_of_arguments); +} + +void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0) { + pass_arg0(this, arg_0); + call_VM_leaf_base(entry_point, 1); +} + +void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { + pass_arg0(this, arg_0); + pass_arg1(this, arg_1); + call_VM_leaf_base(entry_point, 2); +} + +void MacroAssembler::call_VM_leaf(address entry_point, Register arg_0, + Register arg_1, Register arg_2) { + pass_arg0(this, arg_0); + pass_arg1(this, arg_1); + pass_arg2(this, arg_2); + call_VM_leaf_base(entry_point, 3); +} + +void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0) { + pass_arg0(this, arg_0); + MacroAssembler::call_VM_leaf_base(entry_point, 1); +} + +void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1) { + + assert(arg_0 != c_rarg1, "smashed arg"); + pass_arg1(this, arg_1); + pass_arg0(this, arg_0); + MacroAssembler::call_VM_leaf_base(entry_point, 2); +} + +void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2) { + assert(arg_0 != c_rarg2, "smashed arg"); + assert(arg_1 != c_rarg2, "smashed arg"); + pass_arg2(this, arg_2); + assert(arg_0 != c_rarg1, "smashed arg"); + pass_arg1(this, arg_1); + pass_arg0(this, arg_0); + MacroAssembler::call_VM_leaf_base(entry_point, 3); +} + +void MacroAssembler::super_call_VM_leaf(address entry_point, Register arg_0, Register arg_1, Register arg_2, Register arg_3) { + assert(arg_0 != c_rarg3, "smashed arg"); + assert(arg_1 != c_rarg3, "smashed arg"); + assert(arg_2 != c_rarg3, "smashed arg"); + pass_arg3(this, arg_3); + assert(arg_0 != c_rarg2, "smashed arg"); + assert(arg_1 != c_rarg2, "smashed arg"); + pass_arg2(this, arg_2); + assert(arg_0 != c_rarg1, "smashed arg"); + pass_arg1(this, arg_1); + pass_arg0(this, arg_0); + MacroAssembler::call_VM_leaf_base(entry_point, 4); +} + +// Clobbers rscratch1 +void MacroAssembler::null_check(Register reg, int offset) { + if (needs_explicit_null_check(offset)) { + // provoke OS NULL exception if reg = NULL by + // accessing M[reg] w/o changing any registers + // NOTE: this is plenty to provoke a segv + reg_printf("Generating OS check null with ptr = %p\n", reg); + assert(reg != rscratch1, "can't be"); + ldr(rscratch1, Address(reg)); + } else { + // nothing to do, (later) access of M[reg + offset] + // will provoke OS NULL exception if reg = NULL + } +} + +// MacroAssembler protected routines needed to implement +// public methods + +void MacroAssembler::mov(Register r, Address dest, Condition cond) { + code_section()->relocate(pc(), dest.rspec()); + uint32_t imm32 = (uint32_t)dest.target(); + movptr(r, imm32, cond); +} + +// Move a constant pointer into r. In aarch32 address space +// is 32 bits in size and so a pointer can be encoded in two mov +// instructions. 
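+// The two-instruction form referred to above is the ARMv7 movw/movt pair
+// (patch_oop above also handles the longer mov/orr/orr/orr sequence used
+// when movw/movt are not available):
+//   movw r, #(imm32 & 0xffff)   ; low 16 bits
+//   movt r, #(imm32 >> 16)      ; high 16 bits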
+void MacroAssembler::movptr(Register r, uintptr_t imm32, Condition cond) { +#ifndef PRODUCT + { + char buffer[64]; + snprintf(buffer, sizeof(buffer), "0x%"PRIX32, imm32); + block_comment(buffer); + } +#endif + Assembler::mov_immediate32(r, imm32, cond, false); +} + +void MacroAssembler::ret(Register reg) { + assert(reg == lr, "Can do return only to LR"); + b(lr); +} + +void MacroAssembler::atomic_inc(Register counter_addr, Register tmp) { + Label retry_load; + bind(retry_load); + // flush and load exclusive from the memory location + ldrex(tmp, counter_addr); + add(tmp, tmp, 1); + // if we store+flush with no intervening write tmp wil be zero + strex(tmp, tmp, counter_addr); + cmp(tmp, 0); + b(retry_load, Assembler::NE); +} + + +// MacroAssembler routines found actually to be needed + +void MacroAssembler::push(Register src) +{ + str(src, Address(pre(sp, -1 * wordSize))); +} + +void MacroAssembler::pop(Register dst) +{ + ldr(dst, Address(post(sp, 1 * wordSize))); +} + +// Note: load_unsigned_short used to be called load_unsigned_word. +int MacroAssembler::load_unsigned_short(Register dst, Address src) { + int off = offset(); + ldrh(dst, src); + return off; +} + +int MacroAssembler::load_unsigned_byte(Register dst, Address src) { + int off = offset(); + ldrb(dst, src); + return off; +} + +int MacroAssembler::load_signed_short(Register dst, Address src) { + int off = offset(); + ldrsh(dst, src); + return off; +} + +int MacroAssembler::load_signed_byte(Register dst, Address src) { + int off = offset(); + ldrsb(dst, src); + return off; +} + +void MacroAssembler::load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2) { + switch (size_in_bytes) { + //case 8: ldr(dst, src); break; + case 4: ldr(dst, src); break; + case 2: is_signed ? load_signed_short(dst, src) : load_unsigned_short(dst, src); break; + case 1: is_signed ? 
load_signed_byte( dst, src) : load_unsigned_byte( dst, src); break; + default: ShouldNotReachHere(); + } +} + +void MacroAssembler::store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2) { + switch (size_in_bytes) { + //case 8: str(src, dst); break; + case 4: str(src, dst); break; + case 2: strh(src, dst); break; + case 1: strb(src, dst); break; + default: ShouldNotReachHere(); + } +} + +void MacroAssembler::decrement(Register reg, int value) { + if (value < 0) { + increment(reg, -value); + return; + } + if (value == 0) { + return; + } + if (operand_valid_for_add_sub_immediate(value)) { + sub(reg, reg, value); + return; + } + assert(reg != rscratch2, "invalid register for decrement"); + mov(rscratch2, (unsigned int) value); + sub(reg, reg, rscratch2); +} + +void MacroAssembler::decrement(Address dst, int value) { + assert(!dst.uses(rscratch1), "invalid address for decrement"); + ldr(rscratch1, dst); + decrement(rscratch1, value); + str(rscratch1, dst); +} + +void MacroAssembler::increment(Register reg, int value) { + if (value < 0) { + decrement(reg, -value); + return; + } + if (value == 0) { + return; + } + if (operand_valid_for_add_sub_immediate(value)) { + add(reg, reg, value); + return; + } + assert(reg != rscratch2, "invalid register for increment"); + mov(rscratch2, (unsigned int) value); + add(reg, reg, rscratch2); +} + +void MacroAssembler::increment(Address dst, int value) { + assert(!dst.uses(rscratch1), "invalid address for increment"); + ldr(rscratch1, dst); + increment(rscratch1, value); + str(rscratch1, dst); +} + +// Loads and stores everything except the pc and sp +void MacroAssembler::pusha() { + unsigned regset = 0b0101111111111111; + stmdb(sp, regset); +} +void MacroAssembler::popa() { + unsigned regset = 0b0101111111111111; + ldmia(sp, regset); +} + +static void multiple_reg_check(unsigned int bitset, Register stack) { + const unsigned int pcbit = 1 << r15_pc->encoding(); + const unsigned int lrbit = 1 << lr->encoding(); + const unsigned int spbit = 1 << sp->encoding(); + const unsigned int stackbit = 1 << stack->encoding(); + assert(!(bitset & spbit), "The SP can be in the list. However, " + "ARM deprecates using these instructions with SP in the list."); + assert(!(bitset & pcbit) || !(bitset & lrbit), + "ARM deprecates using these instructions with both " + "the LR and the PC in the list."); + assert(!(bitset & stackbit), "Instructions with the base register " + "in the list and ! specified are only available before ARMv7, " + "and ARM deprecates the use of such instructions. " + "The value of the base register after such an instruction is UNKNOWN"); +} + +// Push lots of registers in the bit set supplied. Don't push sp. +// Return the number of words pushed +int MacroAssembler::push(unsigned int bitset, Register stack) { + multiple_reg_check(bitset, stack); + unsigned bc = bitset, count = 0, i; + for(i = 0; i <= 15; i++) { + if (1 & bc) count++; + bc >>= 1; + } + // TODO Also why did it only do even quantities before? + stmdb(stack, bitset); + return count; +} + +int MacroAssembler::pop(unsigned int bitset, Register stack) { + multiple_reg_check(bitset, stack); + unsigned bc = bitset, count = 0, i; + for(i = 0; i <= 15; i++) { + if (1 & bc) count++; + bc >>= 1; + } + // TODO Also why did it only do even quantities before? + ldmia(stack, bitset); + return count; +} + +void MacroAssembler::resolve_jobject(Register value, Register thread, Register tmp) { + Label done, not_weak; + cbz(value, done); // Use NULL as-is. 
+ + STATIC_ASSERT(JNIHandles::weak_tag_mask == 1u); + tbz(value, 0, not_weak); // Test for jweak tag. + + // Resolve jweak. + + access_load_word_at(T_OBJECT, IN_NATIVE | ON_PHANTOM_OOP_REF, + value, Address(value, -JNIHandles::weak_tag_value), tmp, noreg); + verify_oop(value); + b(done); + + + bind(not_weak); + // Resolve (untagged) jobject. + access_load_word_at(T_OBJECT, IN_NATIVE, value, Address(value), tmp, noreg); + verify_oop(value); + bind(done); +} + +void MacroAssembler::stop(const char* msg) { + pusha(); + // Save old sp value + add(rscratch2, sp, 14 * wordSize); + str(rscratch2, Address(pre(sp, -4))); + mov(c_rarg0, (address)msg); + mov(c_rarg1, r15_pc); + sub(c_rarg1, c_rarg1, 8); // Restore to actual value + mov(c_rarg2, sp); + mov(c_rarg3, CAST_FROM_FN_PTR(address, MacroAssembler::debug32)); + bl(c_rarg3); + hlt(0); +} + +void MacroAssembler::unimplemented(const char* what) { + const char* buf = NULL; + { + ResourceMark rm; + stringStream ss; + ss.print("unimplemented: %s", what); + buf = code_string(ss.as_string()); + } + stop(buf); +} + +// this simulates the behaviour of the x86 cmpxchg instruction using a +// load linked/store conditional pair. we use the acquire/release +// versions of these instructions so that we flush pending writes as +// per Java semantics. + +// n.b the x86 version assumes the old value to be compared against is +// in rax and updates rax with the value located in memory if the +// cmpxchg fails. we supply a register for the old value explicitly + +// the aarch32 load linked/store conditional instructions do not +// accept an offset. so, unlike x86, we must provide a plain register +// to identify the memory word to be compared/exchanged rather than a +// register+offset Address. + +void MacroAssembler::cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, + Label &succeed, Label *fail) { + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update + // tmp returns 0/1 for success/failure + Label retry_load, nope; + + bind(retry_load); + // flush and load exclusive from the memory location + // and fail if it is not what we expect + ldrex(tmp, addr); + cmp(tmp, oldv); + b(nope, Assembler::NE); + // if we store+flush with no intervening write tmp wil be zero + strex(tmp, newv, addr); + cmp(tmp, 0); + b(succeed, Assembler::EQ); + // retry so we only ever return after a load fails to compare + // ensures we don't return a stale value after a failed write. 
+ b(retry_load); + // if the memory word differs we return it in oldv and signal a fail + bind(nope); + membar(AnyAny); + mov(oldv, tmp); + if (fail) + b(*fail); +} + +void MacroAssembler::cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, + Label &succeed, Label *fail) { + assert(oopDesc::mark_offset_in_bytes() == 0, "assumption"); + cmpxchgptr(oldv, newv, obj, tmp, succeed, fail); +} + +void MacroAssembler::cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, + Label &succeed, Label *fail) { + // oldv holds comparison value + // newv holds value to write in exchange + // addr identifies memory word to compare against/update + // tmp returns 0/1 for success/failure + Label retry_load, nope; + + bind(retry_load); + // flush and load exclusive from the memory location + // and fail if it is not what we expect + ldrex(tmp, addr); + cmp(tmp, oldv); + b(nope, Assembler::NE); + // if we store+flush with no intervening write tmp wil be zero + strex(tmp, newv, addr); + cmp(tmp, 0); + b(succeed, Assembler::EQ); + // retry so we only ever return after a load fails to compare + // ensures we don't return a stale value after a failed write. + b(retry_load); + // if the memory word differs we return it in oldv and signal a fail + bind(nope); + membar(AnyAny); + mov(oldv, tmp); + if (fail) + b(*fail); +} + +#ifndef PRODUCT +extern "C" void findpc(intptr_t x); +#endif + +void MacroAssembler::debug32(char* msg, int32_t pc, int32_t regs[]) +{ + print_unseen_bytecodes(); + // In order to get locks to work, we need to fake a in_VM state + if (ShowMessageBoxOnError) { + JavaThread* thread = JavaThread::current(); + JavaThreadState saved_state = thread->thread_state(); + thread->set_thread_state(_thread_in_vm); +#ifndef PRODUCT + if (CountBytecodes || TraceBytecodes || StopInterpreterAt) { + ttyLocker ttyl; + BytecodeCounter::print(); + } +#endif + if (os::message_box(msg, "Execution stopped, print registers?")) { + ttyLocker ttyl; + tty->print_cr(" pc = 0x%016x", pc); +#ifndef PRODUCT + tty->cr(); + findpc(pc); + tty->cr(); +#endif + tty->print_cr("THIS IS WRONG!"); + tty->print_cr(" r0 = 0x%016x", regs[0]); + tty->print_cr(" r1 = 0x%016x", regs[1]); + tty->print_cr(" r2 = 0x%016x", regs[2]); + tty->print_cr(" r3 = 0x%016x", regs[3]); + tty->print_cr(" r4 = 0x%016x", regs[4]); + tty->print_cr(" r5 = 0x%016x", regs[5]); + tty->print_cr(" r6 = 0x%016x", regs[6]); + tty->print_cr(" r7 = 0x%016x", regs[7]); + tty->print_cr(" r8 = 0x%016x", regs[8]); + tty->print_cr(" r9 = 0x%016x", regs[9]); + tty->print_cr("r10 = 0x%016x", regs[10]); + tty->print_cr("r11 = 0x%016x", regs[11]); + tty->print_cr("r12 = 0x%016x", regs[12]); + tty->print_cr("r13 = 0x%016x", regs[13]); + tty->print_cr("r14 = 0x%016x", regs[14]); + tty->print_cr("r15 = 0x%016x", regs[15]); + BREAKPOINT; + } + ThreadStateTransition::transition(thread, _thread_in_vm, saved_state); + } else { + { + ttyLocker ttyl; + ::tty->print_cr("=============== DEBUG MESSAGE: %s ================", msg); + ::tty->print_cr(" r0 [ arg0 ] = 0x%08x", regs[1]); + ::tty->print_cr(" r1 [ arg1 ] = 0x%08x", regs[2]); + ::tty->print_cr(" r2 [ arg2 ] = 0x%08x", regs[3]); + ::tty->print_cr(" r3 [ arg3 ] = 0x%08x", regs[4]); + ::tty->print_cr(" r4 [ rdispatch ] = 0x%08x", regs[5]); + ::tty->print_cr(" r5 [ rbcp ] = 0x%08x", regs[6]); + ::tty->print_cr(" r6 [ rlocals ] = 0x%08x", regs[7]); + ::tty->print_cr(" r7 [ rcpool ] = 0x%08x", regs[8]); + ::tty->print_cr(" r8 [ rmethod ] = 0x%08x", regs[9]); + ::tty->print_cr(" r9 [ rscratch1 ] = 
0x%08x", regs[10]); + ::tty->print_cr("r10 [ rthread ] = 0x%08x", regs[11]); + ::tty->print_cr("r11 [ rfp ] = 0x%08x", regs[12]); + ::tty->print_cr("r12 [ rscratch2 ] = 0x%08x", regs[13]); + ::tty->print_cr("r13 [ sp ] = 0x%08x", regs[0]); + ::tty->print_cr("r14 [ lr ] = 0x%08x", regs[14]); + ::tty->print_cr("r15 [ pc ] = 0x%08x", pc); + } + assert(false, "DEBUG MESSAGE: %s", msg); + } +} + +void MacroAssembler::push_call_clobbered_registers() { + push(RegSet::range(r0, r3), sp); + if(hasFPU()) { + const int nfloat = 16; // number of callee-saved 32-bit float registers + vstmdb_f64(sp, (1 << nfloat/2) - 1); + } +} + +void MacroAssembler::pop_call_clobbered_registers() { + if(hasFPU()) { + const int nfloat = 16; // number of callee-saved 32-bit float registers + vldmia_f64(sp, (1 << nfloat/2) - 1); + } + pop(RegSet::range(r0, r3), sp); +} + +void MacroAssembler::push_CPU_state() { + // if fix this, update also RegisterSaved::save_live_registers and it's map + push(0x5fff, sp); // integer registers except sp & (aarch32 pc) + + if(hasFPU()) { + const int nfloat = FPUStateSizeInWords / 2; // saved by pairs + vstmdb_f64(sp, (1 << nfloat) - 1); + } else { + sub(sp, sp, FPUStateSizeInWords * wordSize); + } +} + +void MacroAssembler::pop_CPU_state() { + if(hasFPU()) { + const int nfloat = FloatRegisterImpl::number_of_registers / 2; + vldmia_f64(sp, (1 << nfloat) - 1); + } else { + add(sp, sp, FPUStateSizeInWords * wordSize); + } + + pop(0x5fff, sp); // integer registers except sp & (aarch32 pc) +} + +// appears this needs to round up! +void MacroAssembler::round_to(Register reg, int modulus) { + // from x86 + add(reg, reg, modulus - 1); + bic(reg, reg, modulus - 1); // and( reg, -modulus) +} + +SkipIfEqual::SkipIfEqual( + MacroAssembler* masm, const bool* flag_addr, bool value) { + _masm = masm; + _masm->mov(rscratch1, ExternalAddress((address)flag_addr)); + _masm->ldrb(rscratch1, rscratch1); + _masm->cmp(rscratch1, 0); + _masm->b(_label, value ? Assembler::NE : Assembler::EQ); +} + +SkipIfEqual::~SkipIfEqual() { + _masm->bind(_label); +} + +void MacroAssembler::cmpptr(Register src1, Address src2) { + mov(rscratch1, src2); + ldr(rscratch1, Address(rscratch1)); + cmp(src1, rscratch1); +} + +void MacroAssembler::cmpoop(Register obj1, Register obj2) { + BarrierSetAssembler* bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->obj_equals(this, obj1, obj2); +} + +void MacroAssembler::load_klass(Register dst, Register src) { + ldr(dst, Address(src, oopDesc::klass_offset_in_bytes())); +} + +// ((OopHandle)result).resolve(); +void MacroAssembler::resolve_oop_handle(Register result, Register tmp) { + // OopHandle::resolve is an indirection. 
+ access_load_word_at(T_OBJECT, IN_NATIVE, result, Address(result), tmp, noreg); +} + +void MacroAssembler::load_mirror(Register dst, Register method, Register tmp) { + const int mirror_offset = in_bytes(Klass::java_mirror_offset()); + ldr(dst, Address(rmethod, Method::const_offset())); + ldr(dst, Address(dst, ConstMethod::constants_offset())); + ldr(dst, Address(dst, ConstantPool::pool_holder_offset_in_bytes())); + ldr(dst, Address(dst, mirror_offset)); + resolve_oop_handle(dst, tmp); +} + +void MacroAssembler::cmp_klass(Register oop, Register trial_klass, Register tmp) { + ldr(tmp, Address(oop, oopDesc::klass_offset_in_bytes())); + cmp(trial_klass, tmp); +} + +void MacroAssembler::load_prototype_header(Register dst, Register src) { + load_klass(dst, src); + ldr(dst, Address(dst, Klass::prototype_header_offset())); +} + +void MacroAssembler::store_klass(Register dst, Register src) { + str(src, Address(dst, oopDesc::klass_offset_in_bytes())); +} + +void MacroAssembler::store_klass_gap(Register dst, Register src) { } + +void MacroAssembler::access_load_word_at(BasicType type, DecoratorSet decorators, + Register dst, Address src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } else { + bs->load_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_store_word_at(BasicType type, DecoratorSet decorators, + Address dst, Register src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } else { + bs->store_word_at(this, decorators, type, dst, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_load_tos_at(BasicType type, DecoratorSet decorators, + Address src, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::load_tos_at(this, decorators, type, src, tmp1, thread_tmp); + } else { + bs->load_tos_at(this, decorators, type, src, tmp1, thread_tmp); + } +} + +void MacroAssembler::access_store_tos_at(BasicType type, DecoratorSet decorators, + Address dst, + Register tmp1, Register thread_tmp) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + decorators = AccessInternal::decorator_fixup(decorators); + bool as_raw = (decorators & AS_RAW) != 0; + if (as_raw) { + bs->BarrierSetAssembler::store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); + } else { + bs->store_tos_at(this, decorators, type, dst, tmp1, thread_tmp); + } +} + +void MacroAssembler::load_heap_oop(Register dst, Address src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_load_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); +} + +void MacroAssembler::load_heap_oop_not_null(Register dst, Address src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_load_word_at(T_OBJECT, IN_HEAP | IS_NOT_NULL | decorators, 
dst, src, tmp1, thread_tmp); +} + +void MacroAssembler::store_heap_oop(Address dst, Register src, Register tmp1, + Register thread_tmp, DecoratorSet decorators) { + access_store_word_at(T_OBJECT, IN_HEAP | decorators, dst, src, tmp1, thread_tmp); +} + +// Used for storing NULLs. +void MacroAssembler::store_heap_oop_null(Address dst, Register tmp) { + access_store_word_at(T_OBJECT, IN_HEAP, dst, noreg, tmp, noreg); +} + +Address MacroAssembler::allocate_metadata_address(Metadata* obj) { + assert(oop_recorder() != NULL, "this assembler needs a Recorder"); + int index = oop_recorder()->allocate_metadata_index(obj); + RelocationHolder rspec = metadata_Relocation::spec(index); + return Address((address)obj, rspec); +} + +// Move an oop into a register. immediate is true if we want +// immediate instrcutions, i.e. we are not going to patch this +// instruction while the code is being executed by another thread. In +// that case we can use move immediates rather than the constant pool. +void MacroAssembler::movoop(Register dst, jobject obj, bool immediate) { + int oop_index; + if (obj == NULL) { + oop_index = oop_recorder()->allocate_oop_index(obj); + } else { +#ifdef ASSERT + { + ThreadInVMfromUnknown tiv; + assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "should be real oop"); + } +#endif + oop_index = oop_recorder()->find_index(obj); + } + if (! immediate) { + far_load_oop(dst, oop_index); + } else { + RelocationHolder rspec = oop_Relocation::spec(oop_index); + mov(dst, Address((address)obj, rspec)); + } +} + +// Move a metadata address into a register. +void MacroAssembler::mov_metadata(Register dst, Metadata* obj) { + int oop_index; + if (obj == NULL) { + oop_index = oop_recorder()->allocate_metadata_index(obj); + } else { + oop_index = oop_recorder()->find_index(obj); + } + RelocationHolder rspec = metadata_Relocation::spec(oop_index); + mov(dst, Address((address)obj, rspec)); +} + +void MacroAssembler::far_load(Register dst, address addr) { + address far_load_addr = pc(); + add(dst, r15_pc, 0); + ldr(dst, Address(dst)); + + NativeFarLdr* far_load = (NativeFarLdr*) far_load_addr; + far_load->set_data_addr((intptr_t*) addr); +} + +void MacroAssembler::far_load_oop(Register dst, int oop_index) { + relocate(oop_Relocation::spec(oop_index)); + // can't provide meaningful addr, give far_load addr itself + far_load(dst, pc()); +} + +void MacroAssembler::far_load_metadata(Register dst, int metadata_index) { + relocate(metadata_Relocation::spec(metadata_index)); + // can't provide meaningful addr, give far_load addr itself + far_load(dst, pc()); +} + +void MacroAssembler::far_load_const(Register dst, address const_addr) { + relocate(section_word_Relocation::spec(const_addr, CodeBuffer::SECT_CONSTS)); + far_load(dst, const_addr); +} + +Address MacroAssembler::constant_oop_address(jobject obj) { +#ifdef ASSERT + { + ThreadInVMfromUnknown tiv; + assert(oop_recorder() != NULL, "this assembler needs an OopRecorder"); + assert(Universe::heap()->is_in_reserved(JNIHandles::resolve(obj)), "not an oop"); + } +#endif + int oop_index = oop_recorder()->find_index(obj); + return Address((address)obj, oop_Relocation::spec(oop_index)); +} + +// Defines obj, preserves var_size_in_bytes, okay for t2 == var_size_in_bytes. 
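The two allocation helpers that follow, tlab_allocate and eden_allocate, simply delegate to the BarrierSetAssembler, so the fast path itself is not visible in this file. For orientation, here is a minimal C++ sketch of the bump-the-pointer fast path that TLAB allocation boils down to; the Tlab struct and all names are illustrative stand-ins, not HotSpot types:

    #include <cstddef>
    #include <vector>

    // Illustrative stand-in for a thread-local allocation buffer (TLAB).
    struct Tlab {
      char* top;   // next free byte
      char* end;   // first byte past the buffer
    };

    // Bump-the-pointer fast path: return nullptr where the generated code
    // would branch to slow_case and call into the runtime instead.
    void* tlab_allocate(Tlab& tlab, size_t size_in_bytes) {
      char* obj = tlab.top;
      if (size_in_bytes > static_cast<size_t>(tlab.end - obj)) {
        return nullptr;                    // buffer exhausted: take the slow path
      }
      tlab.top = obj + size_in_bytes;      // defines obj, bumps top
      return obj;
    }

    int main() {
      std::vector<char> buf(1024);
      Tlab tlab = { buf.data(), buf.data() + buf.size() };
      return tlab_allocate(tlab, 16) == buf.data() ? 0 : 1;
    }

eden_allocate is the same pointer bump against a heap-shared top, which is why that variant typically has to update the top atomically rather than with a plain store.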
+void MacroAssembler::tlab_allocate(Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Register t2, + Label& slow_case) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->tlab_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, t2, slow_case); +} + +// Defines obj, preserves var_size_in_bytes +void MacroAssembler::eden_allocate(Register obj, + Register var_size_in_bytes, + int con_size_in_bytes, + Register t1, + Label& slow_case) { + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->eden_allocate(this, obj, var_size_in_bytes, con_size_in_bytes, t1, slow_case); +} + +// Zero words; len is in bytes +// Destroys all registers except addr +// len must be a nonzero multiple of wordSize +void MacroAssembler::zero_memory(Register addr, Register len, Register t1) { + assert_different_registers(addr, len, t1, rscratch1, rscratch2); + +#ifdef ASSERT + { Label L; + tst(len, BytesPerWord - 1); + b(L, Assembler::EQ); + stop("len is not a multiple of BytesPerWord"); + bind(L); + } +#endif + +#ifndef PRODUCT + block_comment("zero memory"); +#endif + + Label loop; + Label entry; + +// Algorithm: +// +// scratch1 = cnt & 7; +// cnt -= scratch1; +// p += scratch1; +// switch (scratch1) { +// do { +// cnt -= 8; +// p[-8] = 0; +// case 7: +// p[-7] = 0; +// case 6: +// p[-6] = 0; +// // ... +// case 1: +// p[-1] = 0; +// case 0: +// p += 8; +// } while (cnt); +// } + + const int unroll = 8; // Number of str instructions we'll unroll + + lsr(len, len, LogBytesPerWord); + andr(rscratch1, len, unroll - 1); // tmp1 = cnt % unroll + sub(len, len, rscratch1); // cnt -= unroll + // t1 always points to the end of the region we're about to zero + add(t1, addr, rscratch1, lsl(LogBytesPerWord)); + adr(rscratch2, entry); + sub(rscratch2, rscratch2, rscratch1, lsl(2)); + mov(rscratch1, 0); + b(rscratch2); + bind(loop); + sub(len, len, unroll); + for (int i = -unroll; i < 0; i++) + str(rscratch1, Address(t1, i * wordSize)); + bind(entry); + add(t1, t1, unroll * wordSize); + cbnz(len, loop); +} + +void MacroAssembler::verify_tlab() { +#ifdef ASSERT + if (UseTLAB && VerifyOops) { + Label next, ok; + + strd(rscratch2, rscratch1, Address(pre(sp, -16))); + + ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); + ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_start_offset()))); + cmp(rscratch2, rscratch1); + b(next, Assembler::HS); + STOP("assert(top >= start)"); + should_not_reach_here(); + + bind(next); + ldr(rscratch2, Address(rthread, in_bytes(JavaThread::tlab_end_offset()))); + ldr(rscratch1, Address(rthread, in_bytes(JavaThread::tlab_top_offset()))); + cmp(rscratch2, rscratch1); + b(ok, Assembler::HS); + STOP("assert(top <= end)"); + should_not_reach_here(); + + bind(ok); + ldrd(rscratch2, rscratch1, Address(post(sp, 16))); + } +#endif +} + +// Writes to stack successive pages until offset reached to check for +// stack overflow + shadow pages. This clobbers tmp. +void MacroAssembler::bang_stack_size(Register size, Register tmp) { + assert_different_registers(tmp, size, rscratch1); + mov(tmp, sp); + // Bang stack for total size given plus shadow page size. + // Bang one page at a time because large size can bang beyond yellow and + // red zones. + Label loop; + mov(rscratch1, os::vm_page_size()); + bind(loop); + lea(tmp, Address(tmp, -os::vm_page_size())); + subs(size, size, rscratch1); + str(size, Address(tmp)); + b(loop, Assembler::GT); + + // Bang down shadow pages too. 
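For the page-at-a-time probing above, a small C++ sketch of the access pattern may help; the region and page size stand in for the real stack and os::vm_page_size(), since probing below the C++ stack pointer is not portably expressible:

    #include <cstddef>
    #include <vector>

    // Touch one byte per page, walking down through `total` bytes with the
    // same stride the banging loop uses, so a guard page cannot be stepped
    // over by a single large allocation.
    void bang_region(char* base, size_t total, size_t page_size) {
      for (size_t off = page_size; off <= total; off += page_size) {
        volatile char* probe = base + total - off;   // highest page first
        *probe = 0;
      }
    }

    int main() {
      const size_t page = 4096;                      // stand-in for os::vm_page_size()
      std::vector<char> pretend_stack(16 * page);
      bang_region(pretend_stack.data(), pretend_stack.size(), page);
    }

The shadow-page loop that follows uses the same one-word-per-page stride, just with a fixed trip count.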
+ // At this point, (tmp-0) is the last address touched, so don't + // touch it again. (It was touched as (tmp-pagesize) but then tmp + // was post-decremented.) Skip this address by starting at i=1, and + // touch a few more pages below. N.B. It is important to touch all + // the way down to and including i=StackShadowPages. + for (int i = 0; i < (int)(JavaThread::stack_shadow_zone_size() / os::vm_page_size()) - 1; i++) { + // this could be any sized move but this is can be a debugging crumb + // so the bigger the better. + lea(tmp, Address(tmp, -os::vm_page_size())); + str(size, Address(tmp)); + } +} + + +// Move the address of the polling page into dest. +void MacroAssembler::get_polling_page(Register dest, address page, relocInfo::relocType rtype) { + if (SafepointMechanism::uses_thread_local_poll()) { + ldr(dest, Address(rthread, Thread::polling_page_offset())); + } else { + mov(dest, Address(page, rtype)); + } +} + +// Move the address of the polling page into r, then read the polling +// page. +address MacroAssembler::read_polling_page(Register r, address page, relocInfo::relocType rtype) { + get_polling_page(r, page, rtype); + return read_polling_page(r, rtype); +} + +address MacroAssembler::read_polling_page(Register r, relocInfo::relocType rtype) { + InstructionMark im(this); + code_section()->relocate(inst_mark(), rtype); + // It's ok to load to reg from reg + off (without write-back) + ldr(r, Address(r, 0)); + return inst_mark(); +} + +// Helper functions for 64-bit multipliction, division and remainder +// does = * +void MacroAssembler::mult_long(Register Rd, Register Rn, Register Rm) { + Register Rdh = (Register)(Rd->encoding_nocheck() + 1); + Register Rnh = (Register)(Rn->encoding_nocheck() + 1); + Register Rmh = (Register)(Rm->encoding_nocheck() + 1); + + mult_long(Rd, Rdh, Rn, Rnh, Rm, Rmh); +} + +// does = * +void MacroAssembler::mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh) { + assert_different_registers(Rn, Rnh); + assert_different_registers(Rm, Rmh); + assert_different_registers(Rd, Rdh); // umull restriction + const Register t = rscratch1; + + mul(t, Rm, Rnh); + mla(t, Rn, Rmh, t); + umull(Rd, Rdh, Rm, Rn); + add(Rdh, t, Rdh); +} + + +int64_t internal_ldiv(int64_t a, int64_t b) { + return a / b; +} + +int64_t internal_lmod(int64_t a, int64_t b) { + return a % b; +} + +void MacroAssembler::divide32(Register res, Register num, Register den, bool want_mod) { + Register cnt = rscratch1; + Register mod = rscratch2; + Register sign = r14; + assert_different_registers(num, den, rscratch1, rscratch2, r14); + + // FIXME This works by first converting any negative values to positive ones, however + // it is not possible to express |INT_MIN|. Need to fix this + + //Convert to positive values + mov(sign, 0); + + cmp(num, 0); + mov(sign, 1, MI); + rsb(num, num, 0, MI); + + cmp(den, 0); + if(!want_mod) eor(sign, sign, 1, MI); + rsb(den, den, 0, MI); + + // Algorithm from + // http://www.chiark.greenend.org.uk/~theom/riscos/docs/ultimate/a252div.txt + // Graeme Williams + mov(cnt, 28); + mov(mod, num, lsr(4)); + cmp(den, mod, lsr(12)); + sub(cnt, cnt, 16, Assembler::LE); + mov(mod, mod, lsr(16), Assembler::LE); + cmp(den, mod, lsr(4)); + sub(cnt, cnt, 8, Assembler::LE); + mov(mod, mod, lsr(8), Assembler::LE); + cmp(den, mod); + sub(cnt, cnt, 4, Assembler::LE); + mov(mod, mod, lsr(4), Assembler::LE); + mov(num, num, lsl(cnt)); + rsb(den, den, 0); + + adds(num, num, num); + //Now skip over cnt copies of the 3 instr. loop. 
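Before the computed jump into the unrolled loop, which continues directly below, it may help to see the whole shift-and-subtract scheme in plain C++. This is only a sketch of the algorithm divide32 implements when no hardware divider is available; the early exit that skips leading zero bits is omitted, and the helper names are not from the patch:

    #include <cstdint>
    #include <cstdio>

    // One quotient bit per iteration: shift the dividend into a remainder
    // from the top and subtract the divisor whenever it fits. The unrolled
    // 32-step loop below does the same thing using the carry flag.
    static void udiv32(uint32_t num, uint32_t den, uint32_t* quot, uint32_t* rem) {
      uint32_t q = 0, r = 0;
      for (int i = 31; i >= 0; i--) {
        r = (r << 1) | ((num >> i) & 1u);
        if (r >= den) { r -= den; q |= 1u << i; }
      }
      *quot = q; *rem = r;                 // den == 0 is not handled here
    }

    // Signed wrapper: divide magnitudes, then patch the sign back in, just
    // as the code above does with the `sign` register. Holding the magnitude
    // as unsigned sidesteps the |INT_MIN| issue noted in the FIXME.
    static int32_t sdiv32(int32_t a, int32_t b, bool want_mod) {
      uint32_t ua = a < 0 ? 0u - (uint32_t)a : (uint32_t)a;
      uint32_t ub = b < 0 ? 0u - (uint32_t)b : (uint32_t)b;
      uint32_t q, r;
      udiv32(ua, ub, &q, &r);
      if (want_mod) return a < 0 ? -(int32_t)r : (int32_t)r;
      return ((a < 0) != (b < 0)) ? -(int32_t)q : (int32_t)q;
    }

    int main() {
      printf("%d %d\n", sdiv32(-7, 2, false), sdiv32(-7, 2, true));  // -3 -1
    }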
+ add(cnt, cnt, cnt, lsl(1)); + add(r15_pc, r15_pc, cnt, lsl(2)); + mov(r0, r0); + + for(int i = 0; i < 32; i++) { + adcs(mod, den, mod, lsl(1)); + sub(mod, mod, den, Assembler::LO); + adcs(num, num, num); + } + + cmp(sign, 0); + rsb(res, want_mod? mod : num, 0, NE); + mov(res, want_mod? mod : num, EQ); +} + + +// = / +// = % +// = / +// = % +void MacroAssembler::divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder) { + //Dispatch to best possible + Register Rdh = (Register)(Rd->encoding_nocheck() + 1); + Register Rnh = (Register)(Rn->encoding_nocheck() + 1); + Register Rmh = (Register)(Rm->encoding_nocheck() + 1); + + assert(32 == width || 64 == width, "Invalid width"); + bool is64b = 64 == width; + + if(is64b) { + assert_different_registers(Rn, Rnh, Rm, Rmh, rscratch1, rscratch2); + } + + if(!is64b && VM_Version::features() & FT_HW_DIVIDE) { + // Emit a hw instruction sequnce. + if(want_remainder) { + sdiv(rscratch1, Rn, Rm); + mls(Rd, rscratch1, Rm, Rn); + } else { + sdiv(Rd, Rn, Rm); + } + } else if(!is64b) { + // Fall back to assembly software routine + divide32(Rd, Rn, Rm, want_remainder); + } else { + // Fall back to C software routine for + // 64 bit divide/mod + if(Rn != r0) { + mov(rscratch1, Rm); + mov(rscratch2, Rmh); + + mov(r0, Rn); + mov(r1, Rnh); + + mov(r2, rscratch1); + mov(r3, rscratch2); + } else if(Rm != r2) { + mov(r2, Rm); + mov(r3, Rmh); + } + address function; + if(want_remainder) function = (address)internal_lmod; + else function = (address)internal_ldiv; + + mov(rscratch1, function); + bl(rscratch1); + if(Rd != r0) { + mov(Rd, r0); + if(is64b) mov(Rdh, r1); + } + } +} + +void MacroAssembler::extract_bits(Register dest, Register source, int lsb, int width) { + assert(lsb >= 0 && lsb + width <= 32 && width != 0, "Invalid lsb/width"); + // Dispatch to the best sequence + if(0 == (lsb & 7) && (width == 8 || width == 16 || width == 32)) { + // Can use extend X + switch(width){ + case 8: uxtb(dest, source, ror(lsb)); break; + case 16: uxth(dest, source, ror(lsb)); break; + default: break; + } + } else if(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { + ubfx(dest, source, lsb, width); + } else { + // Do two shifts + lsl(dest, source, 32 - (width + lsb)); + lsr(dest, dest, 32 - width); + } +} + + +void MacroAssembler::atomic_ldrd(Register Rt, Register Rt2, Register Rbase) { + assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert((Register) (Rt + 1) == Rt2, "Must be contiguous"); + if(VM_Version::features() & FT_SINGLE_CORE) { + ldrd(Rt, Rbase); + } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) { +#ifdef ASSERT + Label lbl; + tst(Rbase, 7); + b(lbl, EQ); + stop("atomic_ldrd is not doubleword aligned!"); + bind(lbl); +#endif // ASSERT + + ldrexd(Rt, Rbase); + } else { + // TODO: Find Java way of logging + static bool warning_printed = false; + if(!warning_printed) { + fprintf(stderr, "Unable to provide atomic doubleword load.\n"); + warning_printed = true; + } + ldrd(Rt, Rbase); + } +} + +void MacroAssembler::atomic_strd(Register Rt, Register Rt2, Register Rbase, + Register temp, Register temp2) { + assert(Rt->encoding_nocheck() % 2 == 0, "Must be an even register"); + assert((Register) (Rt + 1) == Rt2, "Must be contiguous"); + assert((Register) (temp + 1) == temp2, "Must be contiguous"); + assert_different_registers(temp, Rt, Rbase, temp2); + if(VM_Version::features() & FT_SINGLE_CORE) { + strd(Rt, Rbase); + } else if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6K)) { + // First need to gain exclusive 
access + Label retry; + +#ifdef ASSERT + tst(Rbase, 7); + b(retry, EQ); + stop("atomic_strd is not doubleword aligned!"); +#endif // ASSERT + + bind(retry); + ldrexd(temp, Rbase); + strexd(temp, Rt, Rbase); + cmp(temp, 0); + b(retry, NE); + } else { + // TODO: Find Java way of logging + static bool warning_printed = false; + if(!warning_printed) { + fprintf(stderr, "Unable to provide atomic doubleword store.\n"); + warning_printed = true; + } + strd(Rt, Rbase); + } +} + + +#define ENABLE_DEBUGGING 0 +// Helloworld is 2,482,397 +uint32_t MacroAssembler::bytecodes_until_print = 2400000; //13795328; //6888000L; //6881772L; + +uint32_t MacroAssembler::bytecodes_executed = 0; + +int MacroAssembler::enable_debug = 0; +int MacroAssembler::enable_method_debug = 0; +int MacroAssembler::enable_debugging_static = ENABLE_DEBUGGING; + +#define N_J_BYTECODES 238 +const char* j_bytecodes[N_J_BYTECODES] = {"nop", "aconstnull", "iconstm1", "iconst0", "iconst1", "iconst2", "iconst3", "iconst4", "iconst5", "lconst0", +"lconst1", "fconst0", "fconst1", "fconst2", "dconst0", "dconst1", "bipush", "sipush", "ldc", "ldcw", "ldc2w", +"iload", "lload", "fload", "dload", "aload", "iload0", "iload1", "iload2", "iload3", "lload0", "lload1", "lload2", +"lload3", "fload0", "fload1", "fload2", "fload3", "dload0", "dload1", "dload2", "dload3", "aload0", "aload1", "aload2", +"aload3", "iaload", "laload", "faload", "daload", "aaload", "baload", "caload", "saload", "istore", "lstore", "fstore", +"dstore", "astore", "istore0", "istore1", "istore2", "istore3", "lstore0", "lstore1", "lstore2", "lstore3", "fstore0", +"fstore1", "fstore2", "fstore3", "dstore0", "dstore1", "dstore2", "dstore3", "astore0", "astore1", "astore2", "astore3", +"iastore", "lastore", "fastore", "dastore", "aastore", "bastore", "castore", "sastore", "pop", "pop2", "dup", "dupx1", +"dupx2", "dup2", "dup2x1", "dup2x2", "swap", "iadd", "ladd", "fadd", "dadd", "isub", "lsub", "fsub", "dsub", "imul", +"lmul", "fmul", "dmul", "idiv", "ldiv", "fdiv", "ddiv", "irem", "lrem", "frem", "drem", "ineg", "lneg", "fneg", "dneg", +"ishl", "lshl", "ishr", "lshr", "iushr", "lushr", "iand", "land", "ior", "lor", "ixor", "lxor", "iinc", "i2l", "i2f", +"i2d", "l2i", "l2f", "l2d", "f2i", "f2l", "f2d", "d2i", "d2l", "d2f", "i2b", "i2c", "i2s", "lcmp", "fcmpl", "fcmpg", +"dcmpl", "dcmpg", "ifeq", "ifne", "iflt", "ifge", "ifgt", "ifle", "ificmpeq", "ificmpne", "ificmplt", "ificmpge", +"ificmpgt", "ificmple", "ifacmpeq", "ifacmpne", "goto", "jsr", "ret", "tableswitch", "lookupswitch", "ireturn", +"lreturn", "freturn", "dreturn", "areturn", "return", "getstatic", "putstatic", "getfield", "putfield", +"invokevirtual", "invokespecial", "invokestatic", "invokeinterface", "invokedynamic", "new", "newarray", +"anewarray", "arraylength", "athrow", "checkcast", "instanceof", "monitorenter", "monitorexit", "wide", +"multianewarray", "ifnull", "ifnonnull", "gotow", "jsrw", "breakpoint", "fast_agetfield", "fast_bgetfield", +"fast_cgetfield", "fast_dgetfield", "fast_fgetfield", "fast_igetfield", "fast_lgetfield", "fast_sgetfield", +"fast_aputfield", "fast_bputfield", "fast_cputfield", "fast_dputfield", "fast_fputfield", "fast_iputfield", +"fast_lputfield", "fast_sputfield", "fast_aload_0", "fast_iaccess_0", "fast_aaccess_0", "fast_faccess_0", +"fast_iload", "fast_iload2", "fast_icaload", "fast_invokevfinal", "fast_linearswitch", "fast_binaryswitch", +"fast_aldc", "fast_aldc_w", "return_register_finalizer", "invokehandle", "nofast_getfield", "nofast_putfield", +"nofast_aload_0", "nofast_iload", 
"INVALID"}; + +int bytecodes_seen[256]; + +void MacroAssembler::init_unseen_bytecodes() { + for(int i = 0; i < 256; i++ ) { + bytecodes_seen[i] = 0; + } +} + +void MacroAssembler::bytecode_seen(Register bc_reg, Register scratch) { + if(ENABLE_DEBUGGING) { + mov(scratch, (address)bytecodes_seen); + add(scratch, scratch, bc_reg, lsl(2)); + add(bc_reg, bc_reg, 1); + str(bc_reg, Address(scratch)); + sub(bc_reg, bc_reg, 1); + } +} + +void MacroAssembler::print_unseen_bytecodes() { + if(ENABLE_DEBUGGING) { + printf("=== Unseen bytecodes ===\n"); + for(int i = 0; i < N_J_BYTECODES; i++) { + if(0 == bytecodes_seen[i]) { + printf("\t%s\n", j_bytecodes[i]); + } + } + printf("=== End unseen ===\n"); + } else { + printf("Not kept track, enable debugging to view info\n"); + } + fflush(stdout); +} + +int machine_state_regset = 0b0101111111111111; +int machine_state_float_regset = 0b11; + +void MacroAssembler::save_machine_state() { + stmdb(sp, machine_state_regset); + if(hasFPU()) { + vstmdb_f64(sp, machine_state_float_regset); + } + enter(); +} + +void MacroAssembler::restore_machine_state() { + leave(); + if(hasFPU()) { + vldmia_f64(sp, machine_state_float_regset); + } + ldmia(sp, machine_state_regset); +} + +void internal_internal_printf(const char *fmt, ...) { + va_list args; + va_start (args, fmt); + vprintf (fmt, args); + fflush(stdout); + va_end(args); +} + +void internal_printf(const char *format, uint32_t a, uint32_t b, uint32_t c) { + char buf[2048]; + char fmt[2048]; + buf[0] = '\0'; + const char *thread_str = "THREAD 0x%08x : "; + int id = pthread_self(); + strcpy(fmt, format); + + char *str = strtok(fmt, "\n"); + int nreplace = 0; + while(str) { + strcpy(buf, thread_str); + strcat(buf, str); + strcat(buf, "\n"); + internal_internal_printf((const char*)buf, id, a, b, c); + str = strtok(NULL, "\n"); + } +} + +void MacroAssembler::get_bytecode(Register dst, Register bc) { + if(ENABLE_DEBUGGING) { + int nbytecodes = N_J_BYTECODES; + mov(dst, (address)j_bytecodes); + cmp(bc, nbytecodes); + + ldr(dst, Address(dst, bc, lsl(2)), Assembler::LT); + ldr(dst, Address(dst, wordSize * nbytecodes), Assembler::GE); + } +} + +int invocation_depth_count = -1; //TODO remove this with debugging info + +#define MAX_FCALL_DEPTH 4096 +struct thread_method_record{ + int thread_id; + char names[MAX_FCALL_DEPTH][512]; + int invocation_depth_count; +}; +int ntmrs = 0; +#define MAX_TMRS 10 +thread_method_record tmr_list[MAX_TMRS]; + +void push_tmr(Method *meth, int *thread_id, int *invocation_depth_count, char **name) { + int id = pthread_self(); + *thread_id = id; + for(int i = 0; i < ntmrs; i++) { + thread_method_record *tmr = &tmr_list[i]; + if(id == tmr->thread_id) { + // Add a new frame + if(tmr->invocation_depth_count >= -1 && + tmr->invocation_depth_count < (MAX_FCALL_DEPTH - 1)) { + *invocation_depth_count = ++(tmr->invocation_depth_count); + *name = tmr->names[tmr->invocation_depth_count]; + meth->name_and_sig_as_C_string(tmr->names[tmr->invocation_depth_count], 512); + return; + } else { + fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count); + exit(1); + } + } + } + // Add a new thread + if(ntmrs >= MAX_TMRS) { + fprintf(stderr, "Too many tmrs\n"); + exit(1); + } + //Create a new tmr + tmr_list[ntmrs].thread_id = id; + tmr_list[ntmrs].invocation_depth_count = 0; + meth->name_and_sig_as_C_string(tmr_list[ntmrs].names[0], 512); + *invocation_depth_count = 0; + *name = tmr_list[ntmrs].names[0]; + ntmrs++; +} + +void pop_tmr(int *thread_id, int 
*invocation_depth_count, char **name) { + int id = pthread_self(); + *thread_id = id; + for(int i = 0; i < ntmrs; i++) { + thread_method_record *tmr = &tmr_list[i]; + if(id == tmr->thread_id) { + if(tmr->invocation_depth_count >= 0 && + tmr->invocation_depth_count < MAX_FCALL_DEPTH) { + // Pop frame + *name = tmr->names[tmr->invocation_depth_count]; + *invocation_depth_count = (tmr->invocation_depth_count)--; + return; + } else if ( -1 == tmr->invocation_depth_count) { + *name = (char*)"JVM-EXCEPTION-EXIT:(NOT-REALLY-A-FRAME)"; + *invocation_depth_count = 0; + return; + } else { + fprintf(stderr, "%s : Invalid fcall depth index, %d\n", __FUNCTION__, tmr->invocation_depth_count); + exit(1); + } + } + } + fprintf(stderr, "Unable to find suitable tmr\n"); + exit(1); +} + +void prepare_entry_exit_prefix(char *buf, int id, int invocation_depth_count) { + sprintf(buf, "THREAD 0x%08x : ", id); + for(int i = 0; i < invocation_depth_count; i++) { + strcat(buf, " "); + } +} + + +void print_entry(Method *meth, int native) { + char *name; + int invocation_depth_count, id; + push_tmr(meth, &id, &invocation_depth_count, &name); + + if(MacroAssembler::enable_method_debug) { + char buf[4096], buf_b[2048]; + prepare_entry_exit_prefix(buf, id, invocation_depth_count); + if(native) { + sprintf(buf_b, "CALL NATIVE : %s\n", name); + } else { + sprintf(buf_b, "CALL JAVA : %s\n", name); + } + strcat(buf, buf_b); + printf("%s", buf); + fflush(stdout); + } +} + +void print_exit(bool normal) { + char *name; + int invocation_depth_count, id; + pop_tmr(&id, &invocation_depth_count, &name); + + if(MacroAssembler::enable_method_debug) { + char buf[4096], buf_b[2048]; + prepare_entry_exit_prefix(buf, id, invocation_depth_count); + sprintf(buf_b, normal ? "EXIT : %s\n" : "EXCPN EXIT : %s\n", name); + strcat(buf, buf_b); + printf("%s", buf); + fflush(stdout); + } +} + +void MacroAssembler::print_method_entry(Register rmethod, bool native) { + if(ENABLE_DEBUGGING) { + save_machine_state(); + + bic(sp, sp, 7); // 8-byte align stack + mov(rscratch2, (address)print_entry); + mov(r0, rmethod); + mov(r1, native); + bl(rscratch2); + + restore_machine_state(); + } +} + +void MacroAssembler::print_method_exit(bool normal) { + if(ENABLE_DEBUGGING) { + save_machine_state(); + + bic(sp, sp, 7); // 8-byte align stack + mov(rscratch2, (address)print_exit); + mov(r0, normal); + bl(rscratch2); + + restore_machine_state(); + } +} + +void MacroAssembler::reg_printf_internal(bool important, const char *fmt, Register ra, Register rb, Register rc) { + if(ENABLE_DEBUGGING) { + Label skip; + save_machine_state(); + + mov(rscratch1, ra); + str(rscratch1, Address(pre(sp, -wordSize))); + mov(rscratch1, rb); + str(rscratch1, Address(pre(sp, -wordSize))); + mov(rscratch1, rc); + str(rscratch1, Address(pre(sp, -wordSize))); + + if(!important) { + mov(r0, (address)&enable_debug); + ldr(r0, Address(r0)); + cmp(r0, 0); + b(skip, Assembler::EQ); + } + + int sp_difference = wordSize * (count_bits(machine_state_regset) + + 2 * count_bits(machine_state_float_regset) + + 2 + 3); //Frame entry and saved + + mov(r0, (address)fmt); + if(ra != sp) ldr(r1, Address(sp, 2 * wordSize)); + else add(r1, sp, sp_difference); + + if(rb != sp) ldr(r2, Address(sp, wordSize)); + else add(r2, sp, sp_difference); + + if(rc != sp) ldr(r3, Address(sp)); + else add(r3, sp, sp_difference); + + bic(sp, sp, 7); // 8-byte align stack + + mov(rscratch2, (address)internal_printf); + bl(rscratch2); + + bind(skip); + restore_machine_state(); + } +} + +void 
MacroAssembler::reg_printf(const char *fmt, Register ra, Register rb, Register rc) { + reg_printf_internal(false, fmt, ra, rb, rc); +} + +void MacroAssembler::reg_printf_important(const char *fmt, Register ra, Register rb, Register rc) { + reg_printf_internal(true, fmt, ra, rb, rc); +} + +// When debugging, set the break on bkpnt +void bkpnt() { return; } +void MacroAssembler::create_breakpoint() { + if(ENABLE_DEBUGGING) { + save_machine_state(); + bic(sp, sp, 7); // 8-byte align stack + + mov(rscratch2, (address) bkpnt); + bl(rscratch2); + + restore_machine_state(); + } +} + + +void MacroAssembler::print_cpool(InstanceKlass *klass) { + ttyLocker ttyl; + klass->constants()->print_on(tty); +} + +int MacroAssembler::ldrd(Register Rt, Register Rt2, const Address& adr, Register Rtmp, Condition cond) { + if((0 == Rt->encoding_nocheck() % 2 && + (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && + (uabs(adr.offset()) < (1 << 8))) { + /* Good to go with a ldrd */ + ldrd(Rt, adr, cond); + return 0x0; + } else { + return double_ld_failed_dispatch(Rt, Rt2, adr, &Assembler::ldm, + &Assembler::ldr, Rtmp, cond); + } +} + +int MacroAssembler::strd(Register Rt, Register Rt2, const Address& adr, Condition cond) { + if((0 == Rt->encoding_nocheck() % 2 && + (Rt->encoding_nocheck() + 1 == Rt2->encoding_nocheck())) && + (uabs(adr.offset()) < (1 << 8))) { + /* Good to go with a strd */ + strd(Rt, adr, cond); + } else { + double_ldst_failed_dispatch(Rt, Rt2, adr, &Assembler::stm, &Assembler::str, cond); + } + return 0x0; +} + +int MacroAssembler::double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Register Rtmp, Condition cond) { + if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && + (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { + /* Do a load or store multiple instruction */ + (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); + } else if (!adr.uses(Rt)) { + double_ldst_failed_dispatch(Rt, Rt2, adr, mul, sgl, cond); + } else { + // need to reshuffle operation, otherwise write to Rt destroys adr + if (adr.get_mode() != Address::reg) { + // offset-based addressing. hence Rt2 could not be by adr + if (adr.get_wb_mode() == Address::pre) { + (this->*sgl)(Rt2, Address(pre(adr.base(), adr.offset() + wordSize)), cond); + (this->*sgl)(Rt, Address(pre(adr.base(), -wordSize)), cond); + } else if (adr.get_wb_mode() == Address::post) { + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + (this->*sgl)(Rt, adr, cond); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + (this->*sgl)(Rt, adr, cond); + } else { + ShouldNotReachHere(); + } + } else { + // index-based addressing. both Rt and Rt2 could be used by adr + // hence temp register is necessary + adr.lea(this, Rtmp); + double_ldst_failed_dispatch(Rt, Rt2, Address(Rtmp), mul, sgl, cond); + // adr.lea have only address manipulation and cannot cause trap. 
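As an aside on the selection logic spread across ldrd(), strd() and the two dispatch helpers here, a compact decision function may make the cases easier to follow. It is only a model: the enum and parameter names are invented for illustration, and the offset limit mirrors the |offset| < 256 check above:

    #include <cstdlib>

    enum class PairLoadKind { Ldrd, Ldm, TwoSingles, ViaTemp };

    // Which sequence can load a pair of registers from memory, modelled on
    // the checks in ldrd() and double_ld_failed_dispatch().
    PairLoadKind choose_pair_load(int rt, int rt2, int offset,
                                  bool multiple_ok, bool addr_uses_rt) {
      if (rt % 2 == 0 && rt2 == rt + 1 && std::abs(offset) < (1 << 8)) {
        return PairLoadKind::Ldrd;         // encodable as a single LDRD
      }
      if (multiple_ok && rt < rt2) {
        return PairLoadKind::Ldm;          // ascending register list, use LDM
      }
      if (!addr_uses_rt) {
        return PairLoadKind::TwoSingles;   // two single loads, ordered as needed
      }
      return PairLoadKind::ViaTemp;        // base aliases Rt: lea into a temp first
    }

    int main() {
      return choose_pair_load(0, 1, 4, false, false) == PairLoadKind::Ldrd ? 0 : 1;
    }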
+ // first instruction when NPE can occur is in double_ldst_failed_dispatch + // so shift offset appropriately + return 0x4; + } + } + return 0x0; +} + +void MacroAssembler::double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Condition cond) { + if (can_ldst_multiple(RegSet::of(Rt, Rt2).bits(), adr) && + (Rt->encoding_nocheck() < Rt2->encoding_nocheck())) { + /* Do a store multiple instruction */ + (this->*mul)(RegSet::of(Rt, Rt2).bits(), adr, cond); + } else { + if (adr.get_mode() != Address::reg) { + // offset-based addressing + if (adr.get_wb_mode() == Address::pre) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); + } else if (adr.get_wb_mode() == Address::post) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize - adr.offset()), cond); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt, adr, cond); + (this->*sgl)(Rt2, Address(adr.base(), adr.offset() + wordSize), cond); + } else { + ShouldNotReachHere(); + } + } else { + // index-based addressing + if (adr.get_wb_mode() == Address::pre) { + // current implementation does not use Address::pre for indexed access + ShouldNotReachHere(); + } else if (adr.get_wb_mode() == Address::post) { + // current implementation does not use Address:post for indexed access + // enable the code below and implement proper post() method if it is required +#if 0 + (this->*sgl)(Rt, Address(post(adr.base(), wordSize)), cond); + (this->*sgl)(Rt2, Address(post(adr.base(), adr.index(), adr.shift())), cond); + sub(adr.base(), wordSize, cond); +#endif + ShouldNotReachHere(); + } else if (adr.get_wb_mode() == Address::off) { + (this->*sgl)(Rt, Address(pre(adr.base(), adr.index(), adr.shift(), adr.op())), cond); + (this->*sgl)(Rt2, Address(adr.base(), wordSize), cond); + compensate_addr_offset(adr, cond); + } else { + ShouldNotReachHere(); + } + } + } +} + +#ifdef ASSERT +void MacroAssembler::verify_stack_alignment() { + if (StackAlignmentInBytes > 4) { + Label x; + tst(sp, StackAlignmentInBytes-1); + b(x, EQ); + stop("stack unaligned"); + bind(x); + } +} +#endif + +/** + * Code for BigInteger::multiplyToLen() instrinsic. + * + * r0: x + * r1: xlen + * r2: y + * r3: ylen + * r4: z + * r5: zlen + * + */ +void MacroAssembler::multiply_to_len(Register x, Register xlen, Register y, Register ylen, + Register z, Register zlen, + Register tmp1, Register tmp2, Register tmp3, Register tmp4, + Register tmp5, Register tmp6) { + + assert_different_registers(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); + + const Register xc = xlen; + const Register yc = tmp1; + const Register zc = tmp2; + + const Register vz = tmp3; + const Register carry = tmp4; + const Register vx = tmp5; + const Register vy = tmp6; + + // ensure y (inner cycle) is shorter than x (outer cycle), this in theory uses CPU caches more effectively + Label L_x_longer; + cmp(xlen, ylen); + b(L_x_longer, Assembler::GE); +#define SWP(X, Y) \ + mov(tmp1, Y); \ + mov(Y, X); \ + mov(X, tmp1) + SWP(x, y); + SWP(xlen, ylen); + bind(L_x_longer); + + lea(xc, Address(x, xlen, lsl(LogBytesPerInt))); // x[xstart] + lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[idx] + lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[kdx] + + // First Loop. 
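The Java-level pseudocode quoted in the comments just below maps onto this C++ sketch of the word-by-word schoolbook multiply (array layout is BigInteger's big-endian int[]; the function name here is only illustrative). The per-word update, a 32x32 product plus a previous word plus a carry, is what a single UMAAL computes and can never overflow 64 bits:

    #include <cstdint>
    #include <vector>

    // Schoolbook multiply of big-endian 32-bit word arrays; z gets
    // x.size() + y.size() words, most significant first.
    void multiply_to_len(const std::vector<uint32_t>& x,
                         const std::vector<uint32_t>& y,
                         std::vector<uint32_t>& z) {
      const size_t xlen = x.size(), ylen = y.size();
      z.assign(xlen + ylen, 0);

      // First loop: the last word of x times all of y.
      uint64_t carry = 0;
      for (size_t idx = ylen, kdx = ylen + xlen; idx-- > 0; ) {
        uint64_t product = (uint64_t)y[idx] * x[xlen - 1] + carry;
        z[--kdx] = (uint32_t)product;
        carry = product >> 32;
      }
      z[xlen - 1] = (uint32_t)carry;

      // Second and third loops: remaining rows, now also adding the
      // partial result already in z.
      for (size_t i = xlen - 1; i-- > 0; ) {
        carry = 0;
        for (size_t jdx = ylen, k = ylen + i + 1; jdx-- > 0; ) {
          uint64_t product = (uint64_t)y[jdx] * x[i] + z[--k] + carry;
          z[k] = (uint32_t)product;
          carry = product >> 32;
        }
        z[i] = (uint32_t)carry;
      }
    }

    int main() {
      std::vector<uint32_t> x{0xffffffffu}, y{0xffffffffu}, z;
      multiply_to_len(x, y, z);            // z = { 0xfffffffe, 0x00000001 }
    }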
+ // + // final static long LONG_MASK = 0xffffffffL; + // int xstart = xlen - 1; + // int ystart = ylen - 1; + // long carry = 0; + // for (int idx=ystart, kdx=ystart+1+xstart; idx >= 0; idx--, kdx--) { + // long product = (y[idx] & LONG_MASK) * (x[xstart] & LONG_MASK) + carry; + // z[kdx] = (int)product; + // carry = product >>> 32; + // } + // z[xstart] = (int)carry; + // + + ldr(vx, Assembler::pre(xc, -BytesPerInt)); + mov(carry, 0); + + Label L_loop_1; + bind(L_loop_1); + ldr(vy, Assembler::pre(yc, -BytesPerInt)); + mov(vz, 0); + umaal(vz, carry, vx, vy); + str(vz, Assembler::pre(zc, -BytesPerInt)); + cmp(yc, y); + b(L_loop_1, Assembler::GT); + + str(carry, Address(zc, -BytesPerInt)); + + // Second and third (nested) loops. + // + // for (int i = xstart-1; i >= 0; i--) { // Second loop + // carry = 0; + // for (int jdx=ystart, k=ystart+1+i; jdx >= 0; jdx--, k--) { // Third loop + // long product = (y[jdx] & LONG_MASK) * (x[i] & LONG_MASK) + + // (z[k] & LONG_MASK) + carry; + // z[k] = (int)product; + // carry = product >>> 32; + // } + // z[i] = (int)carry; + // } + // + Label L_loop_2, L_loop_3; + bind(L_loop_2); + + sub(zlen, zlen, 1); + lea(yc, Address(y, ylen, lsl(LogBytesPerInt))); // y[jdx] + lea(zc, Address(z, zlen, lsl(LogBytesPerInt))); // z[k] + + ldr(vx, Assembler::pre(xc, -BytesPerInt)); + mov(carry, 0); + + bind(L_loop_3); + ldr(vy, Assembler::pre(yc, -BytesPerInt)); + ldr(vz, Assembler::pre(zc, -BytesPerInt)); // r1 is vz, r2 is carry + umaal(vz, carry, vx, vy); + str(vz, Address(zc)); + cmp(yc, y); + b(L_loop_3, Assembler::GT); + + str(carry, Address(zc, -BytesPerInt)); + cmp(xc, x); + b(L_loop_2, Assembler::GT); +} + +/** + * Code for BigInteger::mulAdd() instrinsic. + * + * r0: out + * r1: in + * r2: offset + * r3: len + * r4: k + */ +void MacroAssembler::mul_add(Register out, Register in, Register offset, Register len, Register k, + Register tmp1, Register tmp2, Register tmp3) { + + assert_different_registers(out, in, offset, len, k, tmp1, tmp2, tmp3); + + Register vin = tmp1; + Register vout = tmp2; + Register carry = tmp3; + Register result = r0; + +// long kLong = k & LONG_MASK; +// long carry = 0; +// +// offset = out.length-offset - 1; +// for (int j=len-1; j >= 0; j--) { +// long product = (in[j] & LONG_MASK) * kLong + +// (out[offset] & LONG_MASK) + carry; +// out[offset--] = (int)product; +// carry = product >>> 32; +// } +// return (int)carry; + + lea(in, Address(in, len, lsl(LogBytesPerInt))); + lea(out, Address(out, offset, lsl(LogBytesPerInt))); + mov(carry, 0); + + Label L_loop; + bind(L_loop); + ldr(vin, Assembler::pre(in, -BytesPerInt)); + ldr(vout, Assembler::pre(out, -BytesPerInt)); + umaal(vout, carry, vin, k); + str(vout, Address(out)); + subs(len, len, 1); + b(L_loop, Assembler::GT); + + mov(result, carry); +} + +/** + * Emits code to update CRC-32 with a byte value according to constants in table + * + * @param [in,out]crc Register containing the crc. + * @param [in]val Register containing the byte to fold into the CRC. + * @param [in]table Register containing the table of crc constants. + * + * uint32_t crc; + * val = crc_table[(val ^ crc) & 0xFF]; + * crc = val ^ (crc >> 8); + * + */ +void MacroAssembler::update_byte_crc32(Register crc, Register val, Register table) { + eor(val, val, crc); + andr(val, val, 0xff); + ldr(val, Address(table, val, lsl(2))); + eor(crc, val, crc, Assembler::lsr(8)); +} + +/** + * Emits code to update CRC-32 with a 32-bit value according to tables 0 to 3 + * + * @param [in,out]crc Register containing the crc. 
+ * @param [in]v Register containing the 32-bit to fold into the CRC. + * @param [in]table0 Register containing table 0 of crc constants. + * @param [in]table1 Register containing table 1 of crc constants. + * @param [in]table2 Register containing table 2 of crc constants. + * @param [in]table3 Register containing table 3 of crc constants. + * + * uint32_t crc; + * v = crc ^ v + * crc = table3[v&0xff]^table2[(v>>8)&0xff]^table1[(v>>16)&0xff]^table0[v>>24] + * + */ +void MacroAssembler::update_word_crc32(Register crc, Register v, Register tmp, + Register tmp2, Register table0, Register table1, Register table2, Register table3) { + eor(v, crc, v); + uxtb(tmp, v); + uxtb(tmp2, v, ror(8)); + ldr(crc, Address(table3, tmp, lsl(2))); + ldr(tmp2, Address(table2, tmp2, lsl(2))); + uxtb(tmp, v, ror(16)); + eor(crc, crc, tmp2); + uxtb(tmp2, v, ror(24)); + ldr(tmp, Address(table1, tmp, lsl(2))); + ldr(tmp2, Address(table0, tmp2, lsl(2))); + eor(crc, crc, tmp); + eor(crc, crc, tmp2); +} + +/** + * @param crc register containing existing CRC (32-bit) + * @param buf register pointing to input byte buffer (byte*) + * @param len register containing number of bytes + * @param table register that will contain address of CRC table + * @param tmp scratch register + */ +void MacroAssembler::kernel_crc32(Register crc, Register buf, Register len, + Register table0, Register table1, Register table2, Register table3, + Register tmp, Register tmp2, Register tmp3, int is_crc32c) { + Label L_cpu, L_by8_loop, L_by1, L_by1_loop, L_align_by1_loop, L_align_exit, L_exit; + + if (!is_crc32c) + inv(crc, crc); + if (UseCRC32) { + Label CRC_by4_loop, CRC_by1_loop; + + subs(len, len, 4); + b(CRC_by4_loop, Assembler::GE); + adds(len, len, 4); + b(CRC_by1_loop, Assembler::GT); + b(L_exit); + + BIND(CRC_by4_loop); + ldr(tmp, Address(post(buf, 4))); + subs(len, len, 4); + if (!is_crc32c) + crc32w(crc, crc, tmp); + else // is_crc32c + crc32cw(crc, crc, tmp); + b(CRC_by4_loop, Assembler::GE); + adds(len, len, 4); + b(L_exit, Assembler::LE); + BIND(CRC_by1_loop); + ldrb(tmp, Address(post(buf, 1))); + subs(len, len, 1); + if (!is_crc32c) + crc32b(crc, crc, tmp); + else // is_crc32c + crc32cb(crc, crc, tmp); + b(CRC_by1_loop, Assembler::GT); + BIND(L_exit); + if (!is_crc32c) + inv(crc, crc); + return; + } + lea(table0, ExternalAddress( + !is_crc32c ? 
+ StubRoutines::crc_table_addr() : + StubRoutines::crc32c_table_addr() )); + add(table1, table0, 1*256*sizeof(juint)); + add(table2, table0, 2*256*sizeof(juint)); + add(table3, table0, 3*256*sizeof(juint)); + + BIND(L_align_by1_loop); + tst(buf, 3); + b(L_align_exit, Assembler::EQ); + cmp(len, 0); + b(L_exit, Assembler::EQ); + sub(len, len, 1); + ldrb(tmp, Address(post(buf, 1))); + update_byte_crc32(crc, tmp, table0); + b(L_align_by1_loop); + + BIND(L_align_exit); + + if(VM_Version::features() & FT_AdvSIMD) { + if (UseNeon) { + cmp(len, 32+12); // account for possible need for alignment + b(L_cpu, Assembler::LT); + + Label L_fold, L_align_by4_loop, L_align_by4_exit; + + BIND(L_align_by4_loop); + tst(buf, 0xf); + b(L_align_by4_exit, Assembler::EQ); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + sub(len, len, 4); + b(L_align_by4_loop); + + BIND(L_align_by4_exit); + + add(tmp, table0, 4*256*sizeof(juint)); // Point at the Neon constants + + vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128); + vld1_64(d4, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d5, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d6, post(tmp, 8), Assembler::ALIGN_64); + vld1_64(d7, post(tmp, 8), Assembler::ALIGN_64); + veor_64(d16, d16, d16); + vmov_32(d16, 0, crc); + + veor_64(d0, d0, d16); + sub(len, len, 32); + + BIND(L_fold); + vmullp_8(q8, d0, d5); + vmullp_8(q9, d0, d7); + vmullp_8(q10, d0, d4); + vmullp_8(q11, d0, d6); + + vmullp_8(q12, d1, d5); + vmullp_8(q13, d1, d7); + vmullp_8(q14, d1, d4); + vmullp_8(q15, d1, d6); + + vuzp_128_16(q9, q8); + veor_128(q8, q8, q9); + + vuzp_128_16(q13, q12); + veor_128(q12, q12, q13); + + vshll_16u(q9, d16, 8); + vshll_16u(q8, d17, 8); + + vshll_16u(q13, d24, 8); + vshll_16u(q12, d25, 8); + + veor_128(q8, q8, q10); + veor_128(q12, q12, q14); + veor_128(q9, q9, q11); + veor_128(q13, q13, q15); + + veor_64(d19, d19, d18); + veor_64(d18, d27, d26); + + vshll_32u(q13, d18, 16); + vshll_32u(q9, d19, 16); + + veor_128(q9, q8, q9); + veor_128(q13, q12, q13); + + veor_64(d31, d26, d27); + veor_64(d30, d18, d19); + + vshl_128_64(q15, q15, 1); + vld1_64(d0, d1, post(buf, 16), Assembler::ALIGN_128); + veor_128(q0, q0, q15); + + subs(len, len, 16); + b(L_fold, Assembler::GE); + + vmov_32(tmp, d0, 0); + mov(crc, 0); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d0, 1); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d1, 0); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + vmov_32(tmp, d1, 1); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + + add(len, len, 16); + } + } // if FT_AdvSIMD + + BIND(L_cpu); + subs(len, len, 8); + b(L_by8_loop, Assembler::GE); + adds(len, len, 8); + b(L_by1_loop, Assembler::GT); + b(L_exit); + + BIND(L_by8_loop); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + ldr(tmp, Address(post(buf, 4))); + update_word_crc32(crc, tmp, tmp2, tmp3, table0, table1, table2, table3); + subs(len, len, 8); + b(L_by8_loop, Assembler::GE); + adds(len, len, 8); + b(L_exit, Assembler::LE); + BIND(L_by1_loop); + subs(len, len, 1); + ldrb(tmp, Address(post(buf, 1))); + update_byte_crc32(crc, tmp, table0); + b(L_by1_loop, Assembler::GT); + + BIND(L_exit); + if (!is_crc32c) + inv(crc, crc); +} + +/** + * First round Key (cpu implementation) + * @param in register containing address of input data (plain or cipher text) + * 
@param key register containing address of the key data + * @param t0 output register t0 + * @param t1 output register t1 + * @param t2 output register t2 + * @param t3 output register t3 + * @param t4 temporary register + * @param t5 temporary register + * @param t6 temporary register + * @param t7 temporary register + */ +void MacroAssembler::kernel_aescrypt_firstRound(Register in, Register key, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + + ldr(t4, Address(post(key, 4))); + ldr(t5, Address(post(key, 4))); + ldr(t6, Address(post(key, 4))); + ldr(t7, Address(post(key, 4))); + ldr(t0, Address(post(in, 4))); + ldr(t1, Address(post(in, 4))); + ldr(t2, Address(post(in, 4))); + ldr(t3, Address(post(in, 4))); + rev(t0, t0); + rev(t1, t1); + rev(t2, t2); + rev(t3, t3); + eor(t0, t0, t4); + eor(t1, t1, t5); + eor(t2, t2, t6); + eor(t3, t3, t7); +} + +/** + * AES ECB Round + * @param table_te Register contains address of AES replacement table + * @param key register containing address of the key data + * @param t0 Register for input value t0 + * @param t1 Register for input value t1 + * @param t2 Register for input value t2 + * @param t3 Register for input value t3 + * @param a Register for output value + * @param tmp1 Temporary register 1 + * @param tmp2 Temporary register 2 + */ +void MacroAssembler::kernel_aescrypt_round(Register table_te, Register key, + Register t0, Register t1, Register t2, Register t3, + Register a, Register tmp1, Register tmp2) { + + ldr(a, Address(post(key, 4))); // K + uxtb(tmp1, t0, ror(24)); + ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T1 + uxtb(tmp2, t1, ror(16)); + eor(a, a, tmp1); + ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T2 + uxtb(tmp1, t2, ror(8)); + eor(a, a, tmp2, ror(8)); + ldr(tmp1, Address(table_te, tmp1, lsl(2))); // T3 + uxtb(tmp2, t3); + eor(a, a, tmp1, ror(16)); + ldr(tmp2, Address(table_te, tmp2, lsl(2))); // T4 + eor(a, a, tmp2, ror(24)); // a0 +}; + +/** + * + * Last AES encryption round ( 4 bytes ) + * @param table_te + * @param key + * @param to + * @param t0 + * @param t1 + * @param t2 + * @param t3 + * @param t4 + * @param t5 + * @param t6 + * @param t7 + * + * int tt = K[keyOffset++]; + * out[outOffset++] = (byte)(S[(t0 >>> 24) ] ^ (tt >>> 24)); + * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); + * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); + * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); + */ +void MacroAssembler::kernel_aescrypt_lastRound( + Register table_te, Register key, Register to, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + + ldr(t7, Address(post(key, 4))); // tt + + uxtb(t5, t0, ror(24)); + ldr(t4, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t1, ror(16)); + eor(t4, t4, t7, lsr(24)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + uxtb(t5, t2, ror(8)); + eor(t6, t6, t7, lsr(16)); + uxtb(t6, t6); + add(t4, t4, t6, lsl(8)); + ldr(t5, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t3); + eor(t5, t5, t7, lsr(8)); + uxtb(t5, t5); + add(t4, t4, t5, lsl(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + eor(t6, t6, t7); + uxtb(t6, t6); + add(t4, t4, t6, lsl(24)); + + str(t4, Address(post(to, 4))); + +} + +/** + * + * Last AES encryption round ( 4 bytes ) + * @param table_te + * @param key + * @param to + * @param t0 + * @param t1 + * @param t2 + * @param t3 + * @param t4 + * @param t5 + * @param t6 + * @param t7 + * + * int tt = 
K[keyOffset++]; + * out[outOffset++] = (byte)(S[(t0 >>> 24) ] ^ (tt >>> 24)); + * out[outOffset++] = (byte)(S[(t1 >>> 16) & 0xFF] ^ (tt >>> 16)); + * out[outOffset++] = (byte)(S[(t2 >>> 8) & 0xFF] ^ (tt >>> 8)); + * out[outOffset++] = (byte)(S[(t3 ) & 0xFF] ^ (tt )); + */ +void MacroAssembler::kernel_aescrypt_lastRound_cbc( + Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + + uxtb(t5, t0, ror(24)); + ldr(t4, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t1, ror(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + uxtb(t5, t2, ror(8)); + add(t4, t4, t6, lsl(8)); + ldr(t5, Address(table_te, t5, lsl(2))); // S[] + uxtb(t6, t3); + add(t4, t4, t5, lsl(16)); + ldr(t6, Address(table_te, t6, lsl(2))); // S[] + add(t4, t4, t6, lsl(24)); +} + +/** + * AES ECB encryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param keylen register containing key len in bytes + */ +void MacroAssembler::kernel_aescrypt_encryptBlock(Register from, Register to, + Register key, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + Label L_loop; + lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + + kernel_aescrypt_firstRound(from, key, + t0, t1, t2, t3, t4, t5, t6, t7); + + sub(keylen, keylen, 8); + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t1, t2, t3, t4, t7, from); + kernel_aescrypt_round(table_te, key, + t1, t2, t3, t0, t5, t7, from); + kernel_aescrypt_round(table_te, key, + t2, t3, t0, t1, t6, t7, from); + + uxtb(t7, t3, ror(24)); + ldr(t3, Address(table_te, t7, lsl(2))); // T1 + uxtb(t7, t0, ror(16)); + ldr(t7, Address(table_te, t7, lsl(2))); // T2 + mov(t0, t4); // t0=a0 + eor(t3, t3, t7, ror(8)); + uxtb(t7, t1, ror(8)); + ldr(t7, Address(table_te, t7, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, t7, ror(16)); + uxtb(t7, t2); + ldr(t7, Address(table_te, t7, lsl(2))); // T4 + mov(t2, t6); // t2=a2 + eor(t3, t3, t7, ror(24)); + ldr(t7, Address(post(key, 4))); // K + eor(t3, t3, t7); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound( + table_te, key, to, + t0, t1, t2, t3, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t1, t2, t3, t0, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t2, t3, t0, t1, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t3, t0, t1, t2, + t4, t5, t6, t7); +} + +/** + * AES ECB decryption + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param keylen register containing key len in bytes + */ +void MacroAssembler::kernel_aescrypt_decryptBlock(Register from, Register to, + Register key, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7) { + Label L_loop; + lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + push(key, 
sp); + + add(key, key, 16); + kernel_aescrypt_firstRound(from, key, + t0, t1, t2, t3, t4, t5, t6, t7); + + sub(keylen, keylen, 8); + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t3, t2, t1, t4, t7, from); + kernel_aescrypt_round(table_te, key, + t1, t0, t3, t2, t5, t7, from); + kernel_aescrypt_round(table_te, key, + t2, t1, t0, t3, t6, t7, from); + + uxtb(t7, t3, ror(24)); + ldr(t3, Address(table_te, t7, lsl(2))); // T1 + uxtb(t7, t2, ror(16)); + ldr(t7, Address(table_te, t7, lsl(2))); // T2 + mov(t2, t6); // t2=a2 + eor(t3, t3, t7, ror(8)); + uxtb(t7, t1, ror(8)); + ldr(t7, Address(table_te, t7, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, t7, ror(16)); + uxtb(t7, t0); + ldr(t7, Address(table_te, t7, lsl(2))); // T4 + mov(t0, t4); // t0=a0 + eor(t3, t3, t7, ror(24)); + ldr(t7, Address(post(key, 4))); // K + eor(t3, t3, t7); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + pop(key, sp); + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound( + table_te, key, to, + t0, t3, t2, t1, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t1, t0, t3, t2, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t2, t1, t0, t3, + t4, t5, t6, t7); + + kernel_aescrypt_lastRound( + table_te, key, to, + t3, t2, t1, t0, + t4, t5, t6, t7); +} + +/** + * AES CBC encryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param rvec register pointing to roundkey vector + * @param len register containing source len in bytes + */ +void MacroAssembler::kernel_aescrypt_encrypt(Register from, Register to, + Register key, Register rvec, Register len, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + Label L_loop, L_loop2; + lea(table_te, ExternalAddress(StubRoutines::aes_table_te_addr())); + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 + vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 + sub(keylen, keylen, 8); + + add(t4, key, keylen, lsl(2)); + vld1_64(d8, d9, Address(t4), Assembler::ALIGN_STD); // read last key bytes to q4 + vrev32_128_8(q4, q4); + + push(to, sp); + BIND(L_loop2); + // get round key and first round + vld1_64(d0, d1, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q0 + veor_128(q0, q0, q2); + vrev32_128_8(q0, q0); + veor_128(q0, q0, q1); + vmov_f64(t0, t1, d0); + vmov_f64(t2, t3, d1); + + push(RegSet::of(key, from), sp); + push(RegSet::of(to, keylen), sp); + + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t1, t2, t3, t4, to, from); + kernel_aescrypt_round(table_te, key, + t1, t2, t3, t0, t5, to, from); + kernel_aescrypt_round(table_te, key, + t2, t3, t0, t1, t6, to, from); + + uxtb(to, t3, ror(24)); + ldr(t3, Address(table_te, to, lsl(2))); // T1 + uxtb(to, t0, ror(16)); + ldr(to, Address(table_te, to, lsl(2))); // T2 + mov(t0, t4); // t0=a0 + eor(t3, t3, to, ror(8)); + uxtb(to, t1, ror(8)); + ldr(to, Address(table_te, to, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, to, ror(16)); + uxtb(to, t2); + ldr(to, Address(table_te, to, lsl(2))); // T4 + mov(t2, t6); // t2=a2 + eor(t3, t3, to, ror(24)); + ldr(to, Address(post(key, 4))); // K + eor(t3, t3, 
to); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + kernel_aescrypt_lastRound_cbc( + table_te, + t0, t1, t2, t3, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t1, t2, t3, t0, + t5, t6, from); + vmov_f64(d6, t4, t5); + + kernel_aescrypt_lastRound_cbc( + table_te, + t2, t3, t0, t1, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t3, t0, t1, t2, + t5, t6, from); + vmov_f64(d7, t4, t5); + veor_128(q2, q4, q3); + + pop(RegSet::of(to, keylen), sp); + sub(table_te, table_te, 4 * 256); //Te + vst1_64(d4, Address(post(to, 8)), Assembler::ALIGN_STD); + pop(RegSet::of(key, from), sp); + vst1_64(d5, Address(post(to, 8)), Assembler::ALIGN_STD); + + subs(len, len, 16); + b(L_loop2, Assembler::NE); + vstr_f64(d4, Address(rvec)); + vstr_f64(d5, Address(rvec, 8)); + mov(r0, to); + pop(to, sp); + sub(r0, r0, to); +}; + +/** + * AES CBC decryption + * + * @param from register pointing to source array address + * @param to register pointing to destination array address + * @param key register pointing to key + * @param rvec register pointing to roundkey vector + * @param len register containing source len in bytes + */ +void MacroAssembler::kernel_aescrypt_decrypt(Register from, Register to, + Register key, Register rvec, Register len, Register keylen, Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6) { + Label L_loop, L_loop2; + lea(table_te, ExternalAddress(StubRoutines::aes_table_td_addr())); + + ldr(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - + arrayOopDesc::base_offset_in_bytes(T_INT))); + + vld1_64(d2, d3, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q1 + vld1_64(d4, d5, Address(rvec), Assembler::ALIGN_STD); // read rvec bytes to q2 + vld1_64(d10, d11, Address(post(key, 16)), Assembler::ALIGN_STD); // read key to q5 + vrev32_128_8(q1, q1); + sub(keylen, keylen, 8); + + push(to, sp); + BIND(L_loop2); + // get round key and first round + vld1_64(d8, d9, Address(post(from, 16)), Assembler::ALIGN_STD); // read 16 bytes to q4 + + push(RegSet::of(to, key, from, keylen), sp); + vrev32_128_8(q0, q4); + veor_128(q0, q0, q5); + vmov_f64(t0, t1, d0); + vmov_f64(t2, t3, d1); + + BIND(L_loop); + + kernel_aescrypt_round(table_te, key, + t0, t3, t2, t1, t4, to, from); + kernel_aescrypt_round(table_te, key, + t1, t0, t3, t2, t5, to, from); + kernel_aescrypt_round(table_te, key, + t2, t1, t0, t3, t6, to, from); + + uxtb(to, t3, ror(24)); + ldr(t3, Address(table_te, to, lsl(2))); // T1 + uxtb(to, t2, ror(16)); + ldr(to, Address(table_te, to, lsl(2))); // T2 + mov(t2, t6); // t2=a2 + eor(t3, t3, to, ror(8)); + uxtb(to, t1, ror(8)); + ldr(to, Address(table_te, to, lsl(2))); // T3 + mov(t1, t5); // t1=a1 + eor(t3, t3, to, ror(16)); + uxtb(to, t0); + ldr(to, Address(table_te, to, lsl(2))); // T4 + mov(t0, t4); // t0=a0 + eor(t3, t3, to, ror(24)); + ldr(to, Address(post(key, 4))); // K + eor(t3, t3, to); // t3 = a3 + + subs(keylen, keylen, 4); + b(L_loop, Assembler::NE); + + // last round is special + add(table_te, table_te, 4 * 256); //S + + kernel_aescrypt_lastRound_cbc( + table_te, + t0, t3, t2, t1, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t1, t0, t3, t2, + t5, t6, to); + vmov_f64(d6, t4, t5); //q3 + + kernel_aescrypt_lastRound_cbc( + table_te, + t2, t1, t0, t3, + t4, t5, t6); + + kernel_aescrypt_lastRound_cbc( + table_te, + t3, t2, t1, t0, + t5, t6, to); + vmov_f64(d7, t4, t5); 
//q3 + pop(RegSet::of(to, key, from, keylen), sp); + veor_128(q3, q1, q3); + veor_128(q3, q3, q2); + vshl_128_64(q2, q4, 0); + + sub(table_te, table_te, 4 * 256); //Te + + vst1_64(d6, Address(post(to, 8)), Assembler::ALIGN_STD); + subs(len, len, 16); + vst1_64(d7, Address(post(to, 8)), Assembler::ALIGN_STD); + + b(L_loop2, Assembler::NE); + + vstr_f64(d4, Address(rvec)); + vstr_f64(d5, Address(rvec, 8)); + mov(r0, to); + pop(to, sp); + sub(r0, r0, to); +}; + +/* + * First round of SHA1 algorithm + */ +void MacroAssembler::sha_round1(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + eor(st_f, st_d, st_c, ror(32-sh)); + } else { + eor(st_f, st_d, st_c); + } + andr(st_f, st_f, st_b); + eor(st_f, st_f, st_d); +} + +/* + * Second and forth round of SHA1 algorithm + */ +void MacroAssembler::sha_round2(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + eor(st_f, st_b, st_c, ror(32-sh)); + } else { + eor(st_f, st_b, st_c); + } + eor(st_f, st_f, st_d); +} + +/* + * Third round of SHA1 algorithm + */ +void MacroAssembler::sha_round3(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh) { + if (sh) { + andr(st_f, st_b, st_c, ror(32-sh)); + orr(tmp, st_b, st_c, ror(32-sh)); + } else { + andr(st_f, st_b, st_c); + orr(tmp, st_b, st_c); + } + andr(tmp, st_d, tmp); + orr(st_f, st_f, tmp); +} + +/* + * Calculate Deltas w[i] and w[i+1] + * w[i] = (w[i-3] xor w[i-8] xor w[i-14] xor w[i-16]) rotl 1 + */ +void MacroAssembler::sha_w0(FloatRegister w16, FloatRegister w14, + FloatRegister w8, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, bool update) { + vadd_64_32(st_kw, st_k, w16); + if(update) { + veor_64(tmp1, w16, w14); + vext_64(tmp2, w2, w4, 4); + veor_64(tmp3, tmp1, w8); + veor_64(tmp4, tmp3, tmp2); + + vshr_64_u32(tmp1, tmp4, 31); + vshl_64_32(tmp2, tmp4, 1); + vorr_64(w16, tmp1, tmp2); + } +} +/* + * Calculate Deltas w[i] and w[i+1] + */ +void MacroAssembler::sha_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp, + bool update) { + Label L_7, L_6, L_5, L_4, L_3, L_2, L_1, L_done; + andr(rtmp, counter, 0x7); + add(counter, counter, 1); + cmp(rtmp, 7); + b(L_7, Assembler::EQ); + cmp(rtmp, 6); + b(L_6, Assembler::EQ); + cmp(rtmp, 5); + b(L_5, Assembler::EQ); + cmp(rtmp, 4); + b(L_4, Assembler::EQ); + cmp(rtmp, 3); + b(L_3, Assembler::EQ); + cmp(rtmp, 2); + b(L_2, Assembler::EQ); + cmp(rtmp, 1); + b(L_1, Assembler::EQ); + sha_w0(w16, w14, w8, w4, w2, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + BIND(L_1); { + sha_w0(w14, w12, w6, w2, w16, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_2); { + sha_w0(w12, w10, w4, w16, w14, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_3); { + sha_w0(w10, w8, w2, w14, w12, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_4); { + sha_w0(w8, w6, w16, w12, w10, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_5); { + sha_w0(w6, w4, w14, w10, w8, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_6); { + sha_w0(w4, w2, w12, w8, w6, tmp1, tmp2, tmp3, 
tmp4, st_k, st_kw, update); + b(L_done); + } + BIND(L_7); { + sha_w0(w2, w16, w10, w6, w4, tmp1, tmp2, tmp3, tmp4, st_k, st_kw, update); + } + BIND(L_done); +} + +/** + * SHA1 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha_implCompress(Register from, Register state, + Register counter, Register table_k, + Register st_a, Register st_b, + Register st_c, Register st_d, Register st_e, + Register tmp, Register counter2, Register st_new_a, Register st_w) { + Label L_round_1, L_round_2, L_round_3, L_round_4, L_round_4_cont, L_hash_no_w; + + FloatRegister w16 = d0; //q0-q7 + FloatRegister w14 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w12 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w10 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w8 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w6 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w4 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w2 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp1 = w2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp2 = wtmp1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp3 = wtmp2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister wtmp4 = wtmp3->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k1 = wtmp4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k2 = st_k1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_k = st_k2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_kw = st_k->successor(FloatRegisterImpl::DOUBLE); + + + assert_different_registers(st_a,st_b,st_c,st_d,st_e,tmp,counter2, st_new_a, st_w); + assert_different_registers(w2,w4,w6,w8,w10,w12,w14,w16); + + lea(table_k, ExternalAddress(StubRoutines::sha1_table_addr())); + + // read initial 16 W elements + vld1_64(w16, w14, w12, w10, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w8, w6, w4, w2, Address(from), Assembler::ALIGN_STD); + + // revert W + vrev64_128_8(w16, w16); + vrev64_128_8(w12, w12); + vrev64_128_8(w8, w8); + vrev64_128_8(w4, w4); + // load state + ldr(st_a, Address(post(state, 4))); + ldr(st_b, Address(post(state, 4))); + ldr(st_c, Address(post(state, 4))); + ldr(st_d, Address(post(state, 4))); + ldr(st_e, Address(state)); + sub(state, state, 16); + + mov(counter2, 0); + mov(counter, 10); + // first round + vld1_64(st_k1, st_k2, Address(table_k), Assembler::ALIGN_128); + vdup_64_32(st_k, st_k1, 0); + + BIND(L_round_1); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round1(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round1(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_1); + + mov(counter, 10); + // second round + vdup_64_32(st_k, st_k1, 1); + + BIND(L_round_2); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, 
st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round2(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_2); + + mov(counter, 10); + vdup_64_32(st_k, st_k2, 0); + // third round + + BIND(L_round_3); { + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + + sha_round3(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round3(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + sub(counter, counter, 1); + }cbnz(counter, L_round_3); + + mov(counter, 10); + // forth round + vdup_64_32(st_k, st_k2, 1); + + BIND(L_round_4); { + sub(counter, counter, 1); + cmp(counter, 8); + b(L_hash_no_w, Assembler::LO); + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp); + b(L_round_4_cont); + BIND(L_hash_no_w); + sha_w(w16, w14, w12, w10, w8, w6, w4, w2, wtmp1, wtmp2, wtmp3, wtmp4, st_k, st_kw, counter2, tmp, false); + BIND(L_round_4_cont); + + sha_round2(st_b, st_c, st_d, tmp, st_new_a, 0); + vmov_32(st_w, st_kw, 1); + add(st_new_a, st_new_a, st_a, ror(32-5)); + add(st_new_a, st_new_a, st_e); + add(st_new_a, st_new_a, st_w); + + vmov_32(st_w, st_kw, 0); + sha_round2(st_a, st_b, st_c, tmp, st_e, 30); + + add(tmp, st_e, st_new_a, ror(32-5)); + add(tmp, tmp, st_d); + + mov(st_e, st_c); + mov(st_d, st_b, ror(32-30)); + mov(st_c, st_a, ror(32-30)); + mov(st_b, st_new_a); + add(st_a, tmp, st_w); + + }cbnz(counter, L_round_4); + + // load state + ldr(tmp, Address(post(state, 4))); + add(st_a, st_a, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_b, st_b, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_c, st_c, tmp); + ldr(tmp, Address(post(state, 4))); + add(st_d, st_d, tmp); + ldr(tmp, Address(state)); + add(st_e, st_e, tmp); + sub(state, state, 16); + + // save state + str(st_a, Address(post(state, 4))); + str(st_b, Address(post(state, 4))); + str(st_c, Address(post(state, 4))); + str(st_d, Address(post(state, 4))); + str(st_e, Address(state)); +} +/** + * One iteration of SHA256 algorithm + * Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22) + * Ma := (a and b) xor (a and c) xor (b and c) + * t2 := Σ0 + Ma + * Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25) + * Ch := (e and f) xor ((not e) and g) + * t1 := h + Σ1 + Ch + k[i] + w[i] + * h := g + * g := f + * f := e + * e := d + t1 + * d := c + * c := b + * b := a + * a := t1 + t2 + */ +void MacroAssembler::sha256_implCompress_iter0( + Register Da, Register Db, Register Dc, Register Dd, + Register De, Register Df, Register Dg, Register Dh, + FloatRegister Dkw, int index, + Register Dtmp, + Register Dnew_a, Register Dnew_e + ) { + assert_different_registers(Da, Db, Dc, Dd, De, Df, Dg, Dh); + + // Σ0 := (a rotr 2) xor (a rotr 13) xor (a rotr 22) + // Σ1 := (e rotr 6) xor (e rotr 11) xor (e rotr 25) + andr(Dnew_a, Da, Db); + andr(Dnew_e, Da, Dc); + eor(Dnew_a, Dnew_a, Dnew_e); + andr(Dnew_e, Db, Dc); + eor(Dnew_e, Dnew_a, Dnew_e); //Ma + + mov(Dnew_a, Da, ror(2)); + eor(Dnew_a, 
Dnew_a, Da, ror(13)); + eor(Dnew_a, Dnew_a, Da, ror(22)); //Σ0 + + add(Dnew_a, Dnew_a, Dnew_e); //t2 + + andr(Dnew_e, De, Df); + mvn(Dtmp, De); + andr(Dtmp, Dtmp, Dg); + eor(Dtmp, Dnew_e, Dtmp); //Ch + + mov(Dnew_e, De, ror(6)); + eor(Dnew_e, Dnew_e, De, ror(11)); + eor(Dnew_e, Dnew_e, De, ror(25)); //Σ1 + + add(Dnew_e, Dnew_e, Dtmp); + vmov_32(Dtmp, Dkw, index); + add(Dnew_e, Dnew_e, Dh); + + add(Dtmp, Dnew_e, Dtmp); //t1 + + add(Dnew_e, Dtmp, Dd); //new_e + add(Dnew_a, Dtmp, Dnew_a); //new_a +}; +/** + * Four iterations of SHA256 algorithm + */ +void MacroAssembler::sha256_implCompress_iter( + Register ra, Register rb, Register rc, Register rd, + Register re, Register rf, Register rg, Register rh, + FloatRegister Dkw1, FloatRegister Dkw2, + Register step, + Register tmp, + Register ra2, Register re2 + ) { + Label L_4, L_3, L_2, L_1, L_done; + cmp(step, 4); + b(L_4, Assembler::EQ); + cmp(step, 3); + b(L_3, Assembler::EQ); + cmp(step, 2); + b(L_2, Assembler::EQ); + cmp(step, 1); + b(L_1, Assembler::EQ); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 0, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 1, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 0, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 1, tmp, rb, rf); + mov(step, 4); + b(L_done); + BIND(L_1); { + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw1, 0, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 1, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw2, 0, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 1, tmp, ra, re); + mov(step, 0); + b(L_done); + } + BIND(L_2); { + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw1, 0, tmp, rc, rg); + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 1, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw2, 0, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 1, tmp, ra2, re2); + mov(step, 1); + b(L_done); + } + BIND(L_3); { + sha256_implCompress_iter0(rc, rd, ra2, ra, rg, rh, re2, re, Dkw1, 0, tmp, rb, rf); + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 1, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw2, 0, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 1, tmp, rd, rh); + mov(step, 2); + b(L_done); + } + BIND(L_4); { + sha256_implCompress_iter0(rb, rc, rd, ra2, rf, rg, rh, re2, Dkw1, 0, tmp, ra, re); + sha256_implCompress_iter0(ra, rb, rc, rd, re, rf, rg, rh, Dkw1, 1, tmp, ra2, re2); + sha256_implCompress_iter0(ra2, ra, rb, rc, re2, re, rf, rg, Dkw2, 0, tmp, rd, rh); + sha256_implCompress_iter0(rd, ra2, ra, rb, rh, re2, re, rf, Dkw2, 1, tmp, rc, rg); + mov(step, 3); + } + BIND(L_done); +}; + + /* + * Calculate Deltas w[i] and w[i+1] + * s0 := (w[i-15] rotr 7) xor (w[i-15] rotr 18) xor (w[i-15] shr 3) + * s1 := (w[i-2] rotr 17) xor (w[i-2] rotr 19) xor (w[i-2] shr 10) + * w[i] := w[i-16] + s0 + w[i-7] + s1 + */ +void MacroAssembler::sha256_w0( + FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14, + FloatRegister w_m7, FloatRegister w_m6, + FloatRegister w_m2, + FloatRegister Qtmp_S0, FloatRegister Qtmp_S1, + FloatRegister Qtmp1){ + + vmov_64(Qtmp1, w_m15); + vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m14); + vshr_128_u64(Qtmp_S0, Qtmp1, 7); + vshr_128_u64(Qtmp_S1, 
Qtmp1, 18); + veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); + vshr_128_u64(Qtmp_S1, Qtmp1, 35); + veor_128(Qtmp_S0, Qtmp_S0, Qtmp_S1); //S0 + + vshr_128_u64(Qtmp_S1, w_m2, 17); + vshr_128_u64(Qtmp1, w_m2, 19); + veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); + vshr_128_u64(Qtmp1, w_m2, 42); + veor_128(Qtmp_S1, Qtmp_S1, Qtmp1); //S1 + + vmov_64(Qtmp1, w_m7); + vmov_64(Qtmp1->successor(FloatRegisterImpl::DOUBLE), w_m6); + vadd_128_32(Qtmp1, Qtmp1, w_m16); + vadd_128_32(Qtmp1, Qtmp1, Qtmp_S0); + vadd_128_32(w_m16, Qtmp1, Qtmp_S1); // w[i/i+1] + + vdup_64_32(w_m16, w_m16, 0); + vdup_64_32(w_m15, w_m15, 0); +} + +/* + * Calculate Deltas w[i] ... w[i+3] + */ +void MacroAssembler::sha256_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, + FloatRegister st_kw, Register counter, Register rtmp) { + FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister Dtmp1 = as_FloatRegister(tmp1->encoding()); + FloatRegister Dtmp2 = Dtmp1->successor(FloatRegisterImpl::DOUBLE); + Label L_3, L_2, L_1, L_done; + + andr(rtmp, counter, 0x3); + cmp(rtmp, 3); + b(L_3, Assembler::EQ); + cmp(rtmp, 2); + b(L_2, Assembler::EQ); + cmp(rtmp, 1); + b(L_1, Assembler::EQ); + vext_64(Dtmp1, w16, w15, 4); + vext_64(Dtmp2, w14, w13, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w16, w15, w14, w7, w6, w2, tmp1, tmp2, tmp3); + sha256_w0(w14, w13, w12, w5, w4, w16, tmp1, tmp2, tmp3); + b(L_done); + BIND(L_3); { + vext_64(Dtmp1, w12, w11, 4); + vext_64(Dtmp2, w10, w9, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w12, w11, w10, w3, w2, w14, tmp1, tmp2, tmp3); + sha256_w0(w10, w9, w8, w1, w16, w12, tmp1, tmp2, tmp3); + b(L_done); + } + BIND(L_2); { + vext_64(Dtmp1, w8, w7, 4); + vext_64(Dtmp2, w6, w5, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w8, w7, w6, w15, w14, w10, tmp1, tmp2, tmp3); + sha256_w0(w6, w5, w4, w13, w12, w8, tmp1, tmp2, tmp3); + b(L_done); + } + BIND(L_1); { + vext_64(Dtmp1, w4, w3, 4); + vext_64(Dtmp2, w2, w1, 4); + vadd_128_32(st_kw, st_kw, tmp1); + cmp(counter, 3); + b(L_done, Assembler::LO); + sha256_w0(w4, w3, w2, w11, w10, w6, tmp1, tmp2, tmp3); + sha256_w0(w2, w1, w16, w9, w8, w4, tmp1, tmp2, tmp3); + } + BIND(L_done); +} + +/** + * SHA256 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha256_implCompress(Register from, Register state, + Register counter, Register table_k, + Register ra, Register rb, Register rc, Register rd, Register re, + Register rf, Register rg, Register rh, + Register ra2, Register re2) { + + Label L_hash_loop, L_hash_loop_done, L_hash_no_w; + lea(table_k, ExternalAddress(StubRoutines::sha256_table_addr())); + + // read next k + vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128); + // read initial 16 
W elements in q8-q11 + vld1_64(d16, d17, d18, d19, Address(post(from, 32)), Assembler::ALIGN_STD); // read from + vld1_64(d20, d21, d22, d23, Address(post(from, 32)), Assembler::ALIGN_STD); // read from + // revert W + vrev32_128_8(q8, q8); + vrev32_128_8(q9, q9); + vrev32_128_8(q10, q10); + vrev32_128_8(q11, q11); + + vadd_128_32(q7, q7, q8); // k + w + + vdup_64_32(d31, d23, 1); //w1 + vdup_64_32(d30, d23, 0); //w2 + vdup_64_32(d29, d22, 1); //w3 + vdup_64_32(d28, d22, 0); //w4 + vdup_64_32(d27, d21, 1); //w5 + vdup_64_32(d26, d21, 0); //w6 + vdup_64_32(d25, d20, 1); //w7 + vdup_64_32(d24, d20, 0); //w8 + vdup_64_32(d23, d19, 1); //w9 + vdup_64_32(d22, d19, 0); //w10 + vdup_64_32(d21, d18, 1); //w11 + vdup_64_32(d20, d18, 0); //w12 + vdup_64_32(d19, d17, 1); //w13 + vdup_64_32(d18, d17, 0); //w14 + vdup_64_32(d17, d16, 1); //w15 + vdup_64_32(d16, d16, 0); //w16 + + mov(counter, 16); + // load state + push(state, sp); + ldr(ra, Address(post(state, 4))); + ldr(rb, Address(post(state, 4))); + ldr(rc, Address(post(state, 4))); + ldr(rd, Address(post(state, 4))); + ldr(re, Address(post(state, 4))); + ldr(rf, Address(post(state, 4))); + ldr(rg, Address(post(state, 4))); + ldr(rh, Address(state)); + + const Register tmp = from; + const Register step = state; + + // calculate deltas + sha256_w0(d16, d17, d18, d25, d26, d30, q0, q1, q2); + sha256_w0(d18, d19, d20, d27, d28, d16, q0, q1, q2); + + mov(step, 0); // use state for internal counter + sub(counter, counter, 1); + + sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15, + step, + tmp, ra2, re2); + + BIND(L_hash_loop); { + // read next k + vld1_64(d14, d15, Address(post(table_k, 16)), Assembler::ALIGN_128); + //calculate deltas + sha256_w(q8, q9, q10, q11, q12, q13, q14, q15, + q0, q1, q2, + q7, + counter, tmp); + + //calculate state + sha256_implCompress_iter(ra, rb, rc, rd, re, rf, rg, rh, d14, d15, + step, + tmp, ra2, re2); + sub(counter, counter, 1); + } cbnz(counter, L_hash_loop); + + pop(state, sp); + + // load initial state and add to current state + ldr(tmp, Address(post(state, 4))); + add(rb, rb, tmp); + ldr(tmp, Address(post(state, 4))); + add(rc, rc, tmp); + ldr(tmp, Address(post(state, 4))); + add(rd, rd, tmp); + ldr(tmp, Address(post(state, 4))); + add(ra2, ra2, tmp); + ldr(tmp, Address(post(state, 4))); + add(rf, rf, tmp); + ldr(tmp, Address(post(state, 4))); + add(rg, rg, tmp); + ldr(tmp, Address(post(state, 4))); + add(rh, rh, tmp); + ldr(tmp, Address(state)); + add(re2, re2, tmp); + sub(state, state, 28); + + // save state + str(rb, Address(post(state, 4))); + str(rc, Address(post(state, 4))); + str(rd, Address(post(state, 4))); + str(ra2, Address(post(state, 4))); + str(rf, Address(post(state, 4))); + str(rg, Address(post(state, 4))); + str(rh, Address(post(state, 4))); + str(re2, Address(post(state, 4))); +} + +/** + * SHA512 Sigma + * Sigma(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR ROTR(x, sh3) + */ +void MacroAssembler::sha512_sigma(FloatRegister x, + FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3) { + FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding()); + FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE); + assert_different_registers(x, Dtmp0, Dtmp1, Dsigma); + + vshr_64_u64(Dtmp0, x, sh1); + vshl_64_64(Dtmp1, x, 64-sh1); + vorr_64(Dsigma, Dtmp0, Dtmp1); + + vshr_64_u64(Dtmp0, x, sh2); + vshl_64_64(Dtmp1, x, 64-sh2); + vorr_64(Dtmp0, Dtmp0, Dtmp1); + + veor_64(Dsigma, Dsigma, Dtmp0); + + vshr_64_u64(Dtmp0, x, sh3); + vshl_64_64(Dtmp1, x, 64-sh3); + vorr_64(Dtmp0, 
Dtmp0, Dtmp1); + + veor_64(Dsigma, Dsigma, Dtmp0); +} + +/** + * SHA512 Delta + * Delta(x) = ROTR(x, sh1) XOR ROTR(x, sh2) XOR SHR(x, sh3) + */ +void MacroAssembler::sha512_delta(FloatRegister x, + FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3) { + FloatRegister Dtmp0 = as_FloatRegister(Qtmp->encoding()); + FloatRegister Dtmp1 = Dtmp0->successor(FloatRegisterImpl::DOUBLE); + assert_different_registers(x, Dtmp0, Dtmp1, Ddelta); + + vshr_64_u64(Dtmp0, x, sh1); + vshl_64_64(Dtmp1, x, 64-sh1); + vorr_64(Ddelta, Dtmp0, Dtmp1); + + vshr_64_u64(Dtmp0, x, sh2); + vshl_64_64(Dtmp1, x, 64-sh2); + vorr_64(Dtmp0, Dtmp0, Dtmp1); + + veor_64(Ddelta, Ddelta, Dtmp0); + + vshr_64_u64(Dtmp0, x, sh3); + + veor_64(Ddelta, Ddelta, Dtmp0); +} + +/** + * SHA512 Ch + * Ch(x, y, z) = (x AND y) XOR ( NOT x AND z) + */ +void MacroAssembler::sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dch) { + assert_different_registers(x, Dtmp, Dch); + + vmvn_64(Dtmp, x); + vand_64(Dtmp, Dtmp, z); + + vand_64(Dch, x, y); + veor_64(Dch, Dtmp, Dch); +} + +/** + * SHA512 Maj + * Maj(x, y, z) = (x AND y) XOR (x AND z) XOR (y AND z) + */ +void MacroAssembler::sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dmaj) { + assert_different_registers(x, Dtmp, Dmaj); + + vand_64(Dmaj, x, y); + vand_64(Dtmp, x, z); + veor_64(Dmaj, Dmaj, Dtmp); + vand_64(Dtmp, y, z); + veor_64(Dmaj, Dmaj, Dtmp); +} + +/** + * SHA512 digest + * + * @param from register pointing to source array address + * @param state register pointing to state array address + */ +void MacroAssembler::kernel_sha512_implCompress(Register from, Register state, + Register counter, Register table_k) { + Label L_hash_loop, L_hash_no_w; + FloatRegister st_a = d18; //q9-q12 + FloatRegister st_b = st_a->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_c = st_b->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_d = st_c->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_e = st_d->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_f = st_e->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_g = st_f->successor(FloatRegisterImpl::DOUBLE); + FloatRegister st_h = st_g->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister w16 = d0; //q0-q7 + FloatRegister w15 = w16->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w14 = w15->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w13 = w14->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w12 = w13->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w11 = w12->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w10 = w11->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w9 = w10->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w8 = w9->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w7 = w8->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w6 = w7->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w5 = w6->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w4 = w5->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w3 = w4->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w2 = w3->successor(FloatRegisterImpl::DOUBLE); + FloatRegister w1 = w2->successor(FloatRegisterImpl::DOUBLE); + + FloatRegister t1 = d26; + FloatRegister t2 = d27; + FloatRegister new_a = st_h; + FloatRegister new_e = st_d; + FloatRegister new_new_a = st_g; + FloatRegister new_new_e = st_c; + + FloatRegister w0 = w1->successor(FloatRegisterImpl::DOUBLE); + 
assert_different_registers(st_a,st_b,st_c,st_d,st_e,st_f,st_g,st_h); + assert_different_registers(w0,w1,w2,w3,w4,w5,w6,w7); + assert_different_registers(w8,w9,w10,w11,w12,w13,w14,w15,w16); + + lea(table_k, ExternalAddress(StubRoutines::sha512_table_addr())); + + // read initial 16 W elements + vld1_64(w16, w15, w14, w13, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w12, w11, w10, w9, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w8, w7, w6, w5, Address(post(from, 32)), Assembler::ALIGN_STD); + vld1_64(w4, w3, w2, w1, Address(from), Assembler::ALIGN_STD); + // read initial state to a,b,c,d,e,f,g,h + vld1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); + vld1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); + sub(state, state, 32); + + // revert W + vrev64_128_8(w16, w16); + vrev64_128_8(w14, w14); + vrev64_128_8(w12, w12); + vrev64_128_8(w10, w10); + vrev64_128_8(w8, w8); + vrev64_128_8(w6, w6); + vrev64_128_8(w4, w4); + vrev64_128_8(w2, w2); + + + mov(counter, 40); + BIND(L_hash_loop); { + sub(counter, counter, 1); + // first iteration + // calculate T1 + // read K + vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); + vadd_64_64(d31, st_h, w16); + sha512_ch(st_e, st_f, st_g, t2, t1); + sha512_sigma(st_e, q14, t2, 14, 18, 41); + vadd_128_64(q13, q13, q15); + vadd_64_64(t1, t1, t2); + + // calculate T2 + sha512_maj(st_a, st_b, st_c, d30, d31); + sha512_sigma(st_a, q14, t2, 28, 34, 39); + vadd_64_64(t2, t2, d31); + + vadd_64_64(new_a, t1, t2); + vadd_64_64(new_e, st_d, t1); + + // second iteration + // calculate T1 + // read K + vld1_64(d30, Address(post(table_k, 8)), Assembler::ALIGN_64); + vadd_64_64(d31, st_g, w15); + sha512_ch(new_e, st_e, st_f, t2, t1); + sha512_sigma(new_e, q14, t2, 14, 18, 41); + vadd_128_64(q13, q13, q15); + vadd_64_64(t1, t1, t2); + + // calculate T2 + sha512_maj(new_a, st_a, st_b, d30, d31); + sha512_sigma(new_a, q14, t2, 28, 34, 39); + vadd_64_64(t2, t2, d31); + + vadd_64_64(new_new_a, t1, t2); + vadd_64_64(new_new_e, st_c, t1); + + // restore a,b,c,d,e,f,g,h sequence + vswp_128(st_g, st_a); + vswp_128(st_g, st_c); + vswp_128(st_g, st_e); + + cmp(counter, 8); + b(L_hash_no_w, Assembler::LO); + + // calculate W[+1], W[+2] + sha512_delta(w15, q14, t1, 1, 8, 7); + sha512_delta(w2, q14, d30, 19, 61, 6); + sha512_delta(w14, q14, t2, 1, 8, 7); + sha512_delta(w1, q14, d31, 19, 61, 6); + + vadd_128_64(w16, w16, t1); + vadd_128_64(w16, w16, q15); + vadd_64_64(w16, w16, w7); + vadd_64_64(w15, w15, w6); + + BIND(L_hash_no_w); + + vswp_128(w16, w14); + vswp_128(w14, w12); + vswp_128(w12, w10); + vswp_128(w10, w8); + vswp_128(w8, w6); + vswp_128(w6, w4); + vswp_128(w4, w2); + } cbnz(counter, L_hash_loop); + // read initial state to w16 - w9 + vld1_64(w16, w15, w14, w13, Address(post(state, 32)), Assembler::ALIGN_STD); + vld1_64(w12, w11, w10, w9, Address(state), Assembler::ALIGN_STD); + sub(state, state, 32); + + // update state + vadd_128_64(st_a, st_a, w16); + vadd_128_64(st_c, st_c, w14); + vadd_128_64(st_e, st_e, w12); + vadd_128_64(st_g, st_g, w10); + + // store state + vst1_64(st_a, st_b, st_c, st_d, Address(post(state, 32)), Assembler::ALIGN_STD); + vst1_64(st_e, st_f, st_g, st_h, Address(state), Assembler::ALIGN_STD); +} + +void MacroAssembler::bfc_impl(Register Rd, int lsb, int width, Condition cond) { + if (width > 15 && lsb == 0) { + lsr(Rd, Rd, width); + lsl(Rd, Rd, width); + } else if (width > 15 && lsb + width == 32) { + lsl(Rd, Rd, 32 - lsb); + lsr(Rd, Rd, 32 - lsb); + } else { + 
const int lsb1 = (lsb & 1); + int w1 = width <= 8 - lsb1 ? width : 8 - lsb1; + while (width) { + bic(Rd, Rd, ((1 << w1) - 1) << lsb); + width -= w1; + lsb += w1; + w1 = width > 8 ? 8 : width; + } + } +} + +// get_thread can be called anywhere inside generated code so we need +// to save whatever non-callee save context might get clobbered by the +// call to the C thread_local lookup call or, indeed, the call setup +// code. x86 appears to save C arg registers. + +void MacroAssembler::get_thread(Register dst) { + // call pthread_getspecific + // void * pthread_getspecific(pthread_key_t key); + + // Save all call-clobbered regs except dst, plus rscratch1 and rscratch2. + RegSet saved_regs = RegSet::range(r0, r3) + rscratch1 + rscratch2 + lr - dst; + push(saved_regs, sp); + + // Align stack and save value for return + mov(c_rarg1, sp); + sub(sp, sp, wordSize); + bic(sp, sp, 7); + str(c_rarg1, Address(sp)); + + mov(rscratch2, CAST_FROM_FN_PTR(address, Thread::current)); + + bl(rscratch2); + //undo alignment + ldr(sp, Address(sp)); + + if (dst != c_rarg0) { + mov(dst, c_rarg0); + } + + // restore pushed registers + pop(saved_regs, sp); +} + +#ifdef COMPILER2 +// 24-bit word range == 26-bit byte range +bool check26(int offset) { + // this could be simplified, but it mimics encoding and decoding + // an actual branch insrtuction + int off1 = offset << 6 >> 8; + int encoded = off1 & ((1<<24)-1); + int decoded = encoded << 8 >> 6; + return offset == decoded; +} + +// Perform some slight adjustments so the default 32MB code cache +// is fully reachable. +static inline address first_cache_address() { + return CodeCache::low_bound() + sizeof(HeapBlock::Header); +} +static inline address last_cache_address() { + return CodeCache::high_bound() - NativeInstruction::arm_insn_sz; +} + +// Can we reach target using unconditional branch or call from anywhere +// in the code cache (because code can be relocated)? +bool MacroAssembler::_reachable_from_cache(address target) { +#ifdef __thumb__ + if ((1 & (intptr_t)target) != 0) { + // Return false to avoid 'b' if we need switching to THUMB mode. + return false; + } +#endif + + address cl = first_cache_address(); + address ch = last_cache_address(); + + if (ForceUnreachable) { + // Only addresses from CodeCache can be treated as reachable. + if (target < CodeCache::low_bound() || CodeCache::high_bound() <= target) { + return false; + } + } + + intptr_t loffset = (intptr_t)target - (intptr_t)cl; + intptr_t hoffset = (intptr_t)target - (intptr_t)ch; + + return check26(loffset - 8) && check26(hoffset - 8); +} + +bool MacroAssembler::_cache_fully_reachable() { + address cl = first_cache_address(); + address ch = last_cache_address(); + return _reachable_from_cache(cl) && _reachable_from_cache(ch); +} + +bool MacroAssembler::reachable_from_cache(address target) { + assert(CodeCache::contains(pc()), "not supported"); + return _reachable_from_cache(target); +} + +bool MacroAssembler::cache_fully_reachable() { + return _cache_fully_reachable(); +} + +// IMPORTANT: does not generate mt-safe patchable code +void MacroAssembler::call(address target, RelocationHolder rspec, Condition cond) { + Register scratch = lr; + assert(rspec.type() == relocInfo::runtime_call_type || rspec.type() == relocInfo::none, "not supported"); + if (reachable_from_cache(target)) { + relocate(rspec); + bl(target, cond); + return; + } + + mov(scratch, (intptr_t)target, cond); + bl(scratch, cond); +} + +// IMPORTANT: does not generate mt-safe patchable code. 
C2 only uses this method +// for calls into runtime which do not need mt-safe patching +void MacroAssembler::jump(address target, relocInfo::relocType rtype, Register scratch, Condition cond) { + assert((rtype == relocInfo::runtime_call_type) || (rtype == relocInfo::none), "not supported"); + if (reachable_from_cache(target)) { + relocate(rtype); + b(target, cond); + return; + } + + mov(scratch, (intptr_t)target, cond); + b(scratch, cond); +} + +void MacroAssembler::arm_stack_overflow_check(int frame_size_in_bytes, Register tmp) { + // Version of AbstractAssembler::generate_stack_overflow_check optimized for ARM + if (UseStackBanging) { + const int page_size = os::vm_page_size(); + + sub(tmp, sp, StackShadowPages*page_size); + strb(r0, Address(tmp)); + for (; frame_size_in_bytes >= page_size; frame_size_in_bytes -= 0xff0) { + strb(r0, pre(tmp, -0xff0)); + } + } +} + +void MacroAssembler::floating_cmp(Register dst) { + vmrs(dst); + orr(dst, dst, 0x08000000); + eor(dst, dst, dst, lsl(3)); + mov(dst, dst, asr(30)); +} + +void MacroAssembler::fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2) { + assert(Roop != Rscratch, ""); + assert(Roop != Rmark, ""); + assert(Rbox != Rscratch, ""); + assert(Rbox != Rmark, ""); + + Label fast_lock, done; + + if (UseBiasedLocking && !UseOptoBiasInlining) { + Label failed; + biased_locking_enter(Roop, Rmark, Rscratch, Rscratch2, false, done, &failed); + bind(failed); + } + + ldr(Rmark, Address(Roop, oopDesc::mark_offset_in_bytes())); + tst(Rmark, markOopDesc::unlocked_value); + b(fast_lock, Assembler::NE); + + // Check for recursive lock + // See comments in InterpreterMacroAssembler::lock_object for + // explanations on the fast recursive locking check. + // -1- test low 2 bits + movs(Rscratch, Rmark, lsl(30)); + // -2- test (hdr - SP) if the low two bits are 0 + sub(Rscratch, Rmark, sp, Assembler::EQ); + movs(Rscratch, Rscratch, lsr(exact_log2(os::vm_page_size())), Assembler::EQ); + // If still 'eq' then recursive locking OK + // set to zero if recursive lock, set to non zero otherwise (see discussion in JDK-8153107) + str(Rscratch, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + b(done); + + bind(fast_lock); + str(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + + membar(StoreStore); + ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes())); + cmp(Rscratch, Rmark); + strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ); + cmp(Rscratch, 0, Assembler::EQ); + membar(AnyAny); + + bind(done); +} + +void MacroAssembler::fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2) { + Register Rmark = Rscratch2; + + assert(Roop != Rscratch, ""); + assert(Roop != Rmark, ""); + assert(Rbox != Rscratch, ""); + assert(Rbox != Rmark, ""); + + Label done; + + if (UseBiasedLocking && !UseOptoBiasInlining) { + biased_locking_exit(Roop, Rscratch, done); + } + + ldr(Rmark, Address(Rbox, BasicLock::displaced_header_offset_in_bytes())); + // If hdr is NULL, we've got recursive locking and there's nothing more to do + cmp(Rmark, 0); + b(done, Assembler::EQ); + + // Restore the object header + membar(AnyAny); + ldrex(Rscratch, Address(Roop, oopDesc::mark_offset_in_bytes())); + cmp(Rscratch, Rmark); + strex(Rscratch, Rbox, Address(Roop, oopDesc::mark_offset_in_bytes()), Assembler::EQ); + cmp(Rscratch, 0, Assembler::EQ); + + membar(StoreLoad); + + bind(done); +} + +#endif --- /dev/null 2018-09-25 19:25:11.000000000 +0300 +++ 
new/src/hotspot/cpu/aarch32/macroAssembler_aarch32.hpp 2018-09-25 19:25:11.000000000 +0300 @@ -0,0 +1,1115 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_HPP +#define CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_HPP + +#include "asm/assembler.hpp" +#include "nativeInst_aarch32.hpp" + +// MacroAssembler extends Assembler by frequently used macros. +// +// Instructions for which a 'better' code sequence exists depending +// on arguments should also go in here. + +class MacroAssembler: public Assembler { + friend class LIR_Assembler; + friend class G1BarrierSetAssembler; + + using Assembler::mov; + + protected: + + // Support for VM calls + // + // This is the base routine called by the different versions of call_VM_leaf. The interpreter + // may customize this version by overriding it for its purposes (e.g., to save/restore + // additional registers when doing a VM call). + virtual void call_VM_leaf_base( + address entry_point, // the entry point + int number_of_arguments, // the number of arguments to pop after the call + Label *retaddr = NULL + ); + + virtual void call_VM_leaf_base( + address entry_point, // the entry point + int number_of_arguments, // the number of arguments to pop after the call + Label &retaddr) { + call_VM_leaf_base(entry_point, number_of_arguments, &retaddr); + } + + // This is the base routine called by the different versions of call_VM. The interpreter + // may customize this version by overriding it for its purposes (e.g., to save/restore + // additional registers when doing a VM call). + // + // If no java_thread register is specified (noreg) than rthread will be used instead. call_VM_base + // returns the register which contains the thread upon return. If a thread register has been + // specified, the return value will correspond to that register. If no last_java_sp is specified + // (noreg) than rsp will be used instead. 
+ virtual void call_VM_base( // returns the register containing the thread upon return + Register oop_result, // where an oop-result ends up if any; use noreg otherwise + Register java_thread, // the thread if computed before ; use noreg otherwise + Register last_java_sp, // to set up last_Java_frame in stubs; use noreg otherwise + address entry_point, // the entry point + int number_of_arguments, // the number of arguments (w/o thread) to pop after the call + bool check_exceptions // whether to check for pending exceptions after return + ); + + public: + void init_unseen_bytecodes(); + MacroAssembler(CodeBuffer* code) : Assembler(code) { init_unseen_bytecodes();} + + // These routines should emit JVMTI PopFrame and ForceEarlyReturn handling code. + // The implementation is only non-empty for the InterpreterMacroAssembler, + // as only the interpreter handles PopFrame and ForceEarlyReturn requests. + virtual void check_and_handle_popframe(Register java_thread); + virtual void check_and_handle_earlyret(Register java_thread); + + void safepoint_poll(Label& slow_path); + void safepoint_poll_acquire(Label& slow_path); + + void call_VM_helper(Register oop_result, address entry_point, int number_of_arguments, bool check_exceptions = true); + + // Biased locking support + // obj_reg must be loaded up with the appropriate values. + // swap_reg is killed. + // tmp_reg and tmp_reg2 shall be supplied. + // Optional slow case is for implementations (interpreter and C1) which branch to + // slow case directly. Leaves condition codes set for C2's Fast_Lock node. + // Returns offset of first potentially-faulting instruction for null + // check info (currently consumed only by C1). If + // swap_reg_contains_mark is true then returns -1 as it is assumed + // the calling code has already passed any potential faults. + int biased_locking_enter(Register obj_reg, + Register swap_reg, Register tmp_reg, Register tmp_reg2, + bool swap_reg_contains_mark, + Label& done, Label* slow_case = NULL, + BiasedLockingCounters* counters = NULL); + void biased_locking_exit (Register obj_reg, Register temp_reg, Label& done); + + + // Helper functions for statistics gathering. + // Unconditional atomic increment. + void atomic_inc(Register counter_addr, Register tmp); + void atomic_inc(Address counter_addr, Register tmp1, Register tmp2) { + lea(tmp1, counter_addr); + atomic_inc(tmp1, tmp2); + } + // Load Effective Address + void lea(Register r, const Address &a) { + InstructionMark im(this); + code_section()->relocate(inst_mark(), a.rspec()); + a.lea(this, r); + } + + virtual void _call_Unimplemented(address call_site) { + mov(rscratch2, call_site); + stop("HALT"); + } + +#define call_Unimplemented() _call_Unimplemented((address)__PRETTY_FUNCTION__) + +// macro assembly operations needed for aarch32 + +private: + + int push(unsigned int bitset, Register stack); + int pop(unsigned int bitset, Register stack); + +public: + + void mov(Register dst, Address a, Condition cond = C_DFLT); + + void push(RegSet regs, Register stack) { if (regs.bits()) push(regs.bits(), stack); } + void pop(RegSet regs, Register stack) { if (regs.bits()) pop(regs.bits(), stack); } + + // now mov instructions for loading absolute addresses and 32bit immediates + + inline void mov(Register dst, address addr, Condition cond = C_DFLT) { + // TODO: Do Address end up as address and then passing through this method, after + // being marked for relocation elsewhere? 
If not (as I suspect) then this can + // be relaxed to mov_immediate to potentially produce shorter code sequences. + mov_immediate32(dst, (uint32_t)addr, cond, false); + } + + inline void mov(Register dst, long l, Condition cond = C_DFLT) { + mov(dst, (uint32_t)l, cond); + } + inline void mov(Register dst, unsigned long l, Condition cond = C_DFLT) { + mov(dst, (uint32_t)l, cond); + } + inline void mov(Register dst, int i, Condition cond = C_DFLT) { + mov(dst, (uint32_t)i, cond); + } +#ifdef COMPILER2 + inline void mov(Register dst, jlong i, Condition cond = C_DFLT) { + assert(!(i >> 32), "must be 32-bit"); // really a 32-bit value contained in jlong. not sign extended! + mov(dst, (uint32_t)i, cond); + } + inline void mov(Register dst, julong i, Condition cond = C_DFLT) { + assert(!(i >> 32), "must be 32-bit"); + mov(dst, (uint32_t)i, cond); + } +#endif + inline void mov(Register dst, uint32_t i, Condition cond = C_DFLT) { + mov_immediate(dst, i, cond, false); + } + + inline void mov(Register dst, Register src, Condition cond = C_DFLT) { + Assembler::mov(dst, src, cond); + } + inline void mov(Register dst, Register src, shift_op shift, + Condition cond = C_DFLT) { + Assembler::mov(dst, src, shift, cond); + } + // TODO add sflag compatibility + void movptr(Register r, uintptr_t imm32, Condition cond = C_DFLT); + + // to reduce the chance for mistake these shall overload the mvn(Register, Register) variant + using Assembler::mvn; + using Assembler::mvns; + inline void mvn(Register dst, uint32_t i, Condition cond = C_DFLT) { + mov_immediate(dst, ~i, cond, false); + } + inline void mvns(Register dst, uint32_t i, Condition cond = C_DFLT) { + mov_immediate(dst, ~i, cond, true); + } + + void ret(Register reg); + + // Both of these are aarch64 instructions that can easily be emulated + // Note that this does not quite have the same semantics as aarch64 + // version as this updates the s flag. + void cbz(Register r, Label& l) { + cmp(r, 0); + b(l, EQ); + } + void cbnz(Register r, Label& l) { + cmp(r, 0); + b(l, NE); + } + void tbz(Register r, unsigned bit, Label& l) { + tst(r, 1 << bit); + b(l, EQ); + } + void tbnz(Register r, unsigned bit, Label& l) { + tst(r, 1 << bit); + b(l, NE); + } + + void addmw(Address a, Register incr, Register scratch) { + ldr(scratch, a); + add(scratch, scratch, incr); + str(scratch, a); + } + + // Add constant to memory word + void addmw(Address a, int imm, Register scratch) { + ldr(scratch, a); + if (imm > 0) + add(scratch, scratch, (unsigned)imm); + else + sub(scratch, scratch, (unsigned)-imm); + str(scratch, a); + } + +// XXX stubs + + // macro instructions for accessing and updating floating point + // status register + // + // FPSR : op1 == 011 + // CRn == 0100 + // CRm == 0100 + // op2 == 001 + + inline void get_fpsr(Register reg = as_Register(0xf)) { + vmrs(reg); + } + + inline void set_fpsr(Register reg) { + vmsr(reg); + } + + inline void clear_fpsr() { + mov(rscratch1, 0); + set_fpsr(rscratch1); + } + + // Support for NULL-checks + // + // Generates code that causes a NULL OS exception if the content of reg is NULL. + // If the accessed location is M[reg + offset] and the offset is known, provide the + // offset. No explicit code generation is needed if the offset is within a certain + // range (0 <= offset <= page_size). 
+ + virtual void null_check(Register reg, int offset = -1); + static bool needs_explicit_null_check(intptr_t offset); + + static address target_addr_for_insn(address insn_addr, unsigned insn); + static address target_addr_for_insn(address insn_addr) { + unsigned insn = *(unsigned*)insn_addr; + return target_addr_for_insn(insn_addr, insn); + } + + // Required platform-specific helpers for Label::patch_instructions. + // They _shadow_ the declarations in AbstractAssembler, which are undefined. + static int pd_patch_instruction_size(address branch, address target); + static void pd_patch_instruction(address branch, address target) { + pd_patch_instruction_size(branch, target); + } + +#ifndef PRODUCT + static void pd_print_patched_instruction(address branch); +#endif + + static int patch_oop(address insn_addr, address o); + + // The following 4 methods return the offset of the appropriate move instruction + + // Support for fast byte/short loading with zero extension (depending on particular CPU) + int load_unsigned_byte(Register dst, Address src); + int load_unsigned_short(Register dst, Address src); + + // Support for fast byte/short loading with sign extension (depending on particular CPU) + int load_signed_byte(Register dst, Address src); + int load_signed_short(Register dst, Address src); + + // Support for sign-extension (hi:lo = extend_sign(lo)) + void extend_sign(Register hi, Register lo); + + // Load and store values by size and signed-ness + void load_sized_value(Register dst, Address src, size_t size_in_bytes, bool is_signed, Register dst2 = noreg); + void store_sized_value(Address dst, Register src, size_t size_in_bytes, Register src2 = noreg); + + // Support for inc/dec with optimal instruction selection depending on value. + // increment()/decrement() calls with an address destination will need to use + // rscratch1 to load the value to be incremented. increment()/decrement() + // calls which add or subtract a constant value greater than 2^12 will need + // to use rscratch2 to hold the constant. So, a register increment()/ + // decrement() may trash rscratch2, and an address increment()/decrement() + // may trash rscratch1 and rscratch2. + void decrement(Register reg, int value = 1); + void decrement(Address dst, int value = 1); + void increment(Register reg, int value = 1); + void increment(Address dst, int value = 1); + + // Alignment + void align(int modulus); + + // Stack frame creation/removal + // + // VM and intepreter code may have different stack layouts. enter/leave default layout + // is selected by FrameAPCS option. One can make enter/leave to use VMFrameAPCS instead. + void enter(bool as_apcs = FrameAPCS) { + if (as_apcs) { + mov(rscratch2, sp); + stmdb(sp, RegSet::of(rfp, rscratch2, lr, r15_pc).bits()); + sub(rfp, rscratch2, 4); + } else { + stmdb(sp, RegSet::of(rfp, lr).bits()); + add(rfp, sp, wordSize); + } + } + + void leave(bool as_apcs = FrameAPCS) { + if (as_apcs) { + ldmea(rfp, RegSet::of(rfp, sp, lr).bits(), false/*wb*/); + } else { + sub(sp, rfp, wordSize); + ldmia(sp, RegSet::of(rfp, lr).bits()); + } + } + + // Support for getting the JavaThread pointer (i.e.; a reference to thread-local information) + // The pointer will be loaded into the thread register. + void get_thread(Register thread); + + enum ret_type { ret_type_void, ret_type_integral, ret_type_float, ret_type_double}; + // Support for VM calls + // + // It is imperative that all calls into the VM are handled via the call_VM macros. + // They make sure that the stack linkage is setup correctly. 
call_VM's correspond + // to ENTRY/ENTRY_X entry points while call_VM_leaf's correspond to LEAF entry points. + + + void call_VM(Register oop_result, + address entry_point, + bool check_exceptions = true); + void call_VM(Register oop_result, + address entry_point, + Register arg_1, + bool check_exceptions = true); + void call_VM(Register oop_result, + address entry_point, + Register arg_1, Register arg_2, + bool check_exceptions = true); + void call_VM(Register oop_result, + address entry_point, + Register arg_1, Register arg_2, Register arg_3, + bool check_exceptions = true); + + // Overloadings with last_Java_sp + void call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + int number_of_arguments = 0, + bool check_exceptions = true); + void call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + Register arg_1, bool + check_exceptions = true); + void call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + Register arg_1, Register arg_2, + bool check_exceptions = true); + void call_VM(Register oop_result, + Register last_java_sp, + address entry_point, + Register arg_1, Register arg_2, Register arg_3, + bool check_exceptions = true); + + void get_vm_result (Register oop_result, Register thread); + void get_vm_result_2(Register metadata_result, Register thread); + + // These always tightly bind to MacroAssembler::call_VM_base + // bypassing the virtual implementation + void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, int number_of_arguments = 0, bool check_exceptions = true); + void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, bool check_exceptions = true); + void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, bool check_exceptions = true); + void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, bool check_exceptions = true); + void super_call_VM(Register oop_result, Register last_java_sp, address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4, bool check_exceptions = true); + + void call_VM_leaf(address entry_point, + int number_of_arguments = 0); + void call_VM_leaf(address entry_point, + Register arg_1); + void call_VM_leaf(address entry_point, + Register arg_1, Register arg_2); + void call_VM_leaf(address entry_point, + Register arg_1, Register arg_2, Register arg_3); + + // These always tightly bind to MacroAssembler::call_VM_leaf_base + // bypassing the virtual implementation + void super_call_VM_leaf(address entry_point); + void super_call_VM_leaf(address entry_point, Register arg_1); + void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2); + void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3); + void super_call_VM_leaf(address entry_point, Register arg_1, Register arg_2, Register arg_3, Register arg_4); + + // last Java Frame (fills frame anchor) + void set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + address last_java_pc, + Register scratch); + + void set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Label &last_java_pc, + Register scratch); + + void set_last_Java_frame(Register last_java_sp, + Register last_java_fp, + Register last_java_pc, + Register scratch); + + void reset_last_Java_frame(Register thread); + + // thread in the default location (rthread) 
+ void reset_last_Java_frame(bool clear_fp); + + // Stores + void store_check(Register obj); // store check for obj - register is destroyed afterwards + void store_check(Register obj, Address dst); // same as above, dst is exact store location (reg. is destroyed) + + void resolve_jobject(Register value, Register thread, Register tmp); + + // C 'boolean' to Java boolean: x == 0 ? 0 : 1 + void c2bool(Register x); + + // oop manipulations + void load_klass(Register dst, Register src); + void store_klass(Register dst, Register src); + void cmp_klass(Register oop, Register trial_klass, Register tmp); + + void resolve_oop_handle(Register result, Register tmp); + void load_mirror(Register dst, Register method, Register tmp); + + void access_load_word_at(BasicType type, DecoratorSet decorators, Register dst, Address src, + Register tmp1, Register tmp_thread); + + void access_store_word_at(BasicType type, DecoratorSet decorators, Address dst, Register src, + Register tmp1, Register tmp_thread); + + void access_load_tos_at(BasicType type, DecoratorSet decorators, Address src, + Register tmp1, Register tmp_thread); + + void access_store_tos_at(BasicType type, DecoratorSet decorators, Address dst, + Register tmp1, Register tmp_thread); + + void load_heap_oop(Register dst, Address src, Register tmp1 = noreg, + Register thread_tmp = noreg, DecoratorSet decorators = 0); + + void load_heap_oop_not_null(Register dst, Address src, Register tmp1 = noreg, + Register thread_tmp = noreg, DecoratorSet decorators = 0); + void store_heap_oop(Address dst, Register src, Register tmp1 = noreg, + Register tmp_thread = noreg, DecoratorSet decorators = 0); + + // Used for storing NULL. All other oop constants should be + // stored using routines that take a jobject. + void store_heap_oop_null(Address dst, Register tmp); + + void load_prototype_header(Register dst, Register src); + + void store_klass_gap(Register dst, Register src); + + // This dummy is to prevent a call to store_heap_oop from + // converting a zero (like NULL) into a Register by giving + // the compiler two choices it can't resolve + + void store_heap_oop(Address dst, void* dummy); + + // Push and pop everything that might be clobbered by a native + // runtime call except rscratch1 and rscratch2. (They are always + // scratch, so we don't have to protect them.) Only save the f0-f15 + // and do not save f32-f63 even if present. 
+ void push_call_clobbered_registers(); + void pop_call_clobbered_registers(); + + void push_CPU_state(); + void pop_CPU_state() ; + + // Round up to a power of two + void round_to(Register reg, int modulus); + + // allocation + void eden_allocate( + Register obj, // result: pointer to object after successful allocation + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Label& slow_case // continuation point if fast allocation fails + ); + void tlab_allocate( + Register obj, // result: pointer to object after successful allocation + Register var_size_in_bytes, // object size in bytes if unknown at compile time; invalid otherwise + int con_size_in_bytes, // object size in bytes if known at compile time + Register t1, // temp register + Register t2, // temp register + Label& slow_case // continuation point if fast allocation fails + ); + + void zero_memory(Register addr, Register len, Register t1); + void verify_tlab(); + + // interface method calling + void lookup_interface_method(Register recv_klass, + Register intf_klass, + RegisterOrConstant itable_index, + Register method_result, + Register scan_temp, + Label& no_such_interface, + bool return_method = true); + + // virtual method calling + // n.b. x86 allows RegisterOrConstant for vtable_index + void lookup_virtual_method(Register recv_klass, + RegisterOrConstant vtable_index, + Register method_result); + + // Test sub_klass against super_klass, with fast and slow paths. + + // The fast path produces a tri-state answer: yes / no / maybe-slow. + // One of the three labels can be NULL, meaning take the fall-through. + // If super_check_offset is -1, the value is loaded up from super_klass. + // No registers are killed, except temp_reg. + void check_klass_subtype_fast_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Label* L_success, + Label* L_failure, + Label* L_slow_path, + RegisterOrConstant super_check_offset = RegisterOrConstant(-1)); + + // The rest of the type check; must be wired to a corresponding fast path. + // It does not repeat the fast path logic, so don't use it standalone. + // The temp_reg and temp2_reg can be noreg, if no temps are available. + // Updates the sub's secondary super cache as necessary. + // If set_cond_codes, condition codes will be Z on success, NZ on failure. + void check_klass_subtype_slow_path(Register sub_klass, + Register super_klass, + Register temp_reg, + Register temp2_reg, + Label* L_success, + Label* L_failure, + bool set_cond_codes = false); + + // Simplified, combined version, good for typical uses. + // Falls through on failure. + void check_klass_subtype(Register sub_klass, + Register super_klass, + Register temp_reg, + Label& L_success); + + Address argument_address(RegisterOrConstant arg_slot, int extra_slot_offset = 0); + + + // Debugging + + // only if +VerifyOops + void verify_oop(Register reg, const char* s = "broken oop"); + void verify_oop_addr(Address addr, const char * s = "broken oop addr"); + +// TODO: verify method and klass metadata (compare against vptr?) 
+ void _verify_method_ptr(Register reg, const char * msg, const char * file, int line) {} + void _verify_klass_ptr(Register reg, const char * msg, const char * file, int line){} + +#define verify_method_ptr(reg) _verify_method_ptr(reg, "broken method " #reg, __FILE__, __LINE__) +#define verify_klass_ptr(reg) _verify_klass_ptr(reg, "broken klass " #reg, __FILE__, __LINE__) + + // only if +VerifyFPU + void verify_FPU(int stack_depth, const char* s = "illegal FPU state"); + + // prints msg, dumps registers and stops execution + void stop(const char* msg); + + // prints msg and continues + void warn(const char* msg); + + static void debug32(char* msg, int32_t pc, int32_t regs[]); + + void untested() { stop("untested"); } + + void unimplemented(const char* what = ""); + +#define should_not_reach_here() should_not_reach_here_line(__FILE__, __LINE__) + void should_not_reach_here_line(const char *file, int line) { +#ifdef ASSERT + mov(rscratch1, line); + reg_printf_important(file); + reg_printf_important(": %d", rscratch1); +#endif + stop("should_not_reach_here"); + } + + // Stack overflow checking + void bang_stack_with_offset(int offset) { + // stack grows down, caller passes positive offset + assert(offset > 0, "must bang with negative offset"); + // bang with random value from r0 + if (operand_valid_for_add_sub_immediate(offset)) { + sub(rscratch2, sp, offset); + strb(r0, Address(rscratch2)); + } else { + mov(rscratch2, offset); + strb(r0, Address(sp, rscratch2, Assembler::lsl(), Address::SUB)); + } + } + + // Writes to stack successive pages until offset reached to check for + // stack overflow + shadow pages. Also, clobbers tmp + void bang_stack_size(Register size, Register tmp); + + // Check for reserved stack access in method being exited (for JIT) + void reserved_stack_check(); + + virtual RegisterOrConstant delayed_value_impl(intptr_t* delayed_value_addr, + Register tmp, + int offset); + + // Support for serializing memory accesses between threads + void serialize_memory(Register thread, Register tmp); + + // Arithmetics + + void addptr(Address dst, int32_t src) { + lea(rscratch2, dst); + ldr(rscratch1, Address(rscratch2)); + add(rscratch1, rscratch1, src); + str(rscratch1, Address(rscratch2)); + } + + void cmpptr(Register src1, Address src2); + void cmpoop(Register obj1, Register obj2); + + void cmpxchg_obj_header(Register oldv, Register newv, Register obj, Register tmp, + Label &suceed, Label *fail); + void cmpxchgptr(Register oldv, Register newv, Register addr, Register tmp, + Label &suceed, Label *fail); + + void cmpxchgw(Register oldv, Register newv, Register addr, Register tmp, + Label &suceed, Label *fail); + + void atomic_add(Register prev, RegisterOrConstant incr, Register addr); + void atomic_addw(Register prev, RegisterOrConstant incr, Register addr); + + void atomic_xchg(Register prev, Register newv, Register addr); + void atomic_xchgw(Register prev, Register newv, Register addr); + + void orptr(Address adr, RegisterOrConstant src) { + ldr(rscratch1, adr); + if (src.is_register()) + orr(rscratch1, rscratch1, src.as_register()); + else + orr(rscratch1, rscratch1, src.as_constant()); + str(rscratch1, adr); + } + + // Calls + + void trampoline_call(Address entry, CodeBuffer *cbuf = NULL); + + static bool far_branches() { + return ReservedCodeCacheSize > branch_range; + } + + // Jumps that can reach anywhere in the code cache. + // Trashes tmp. 
+ void far_call(Address entry, CodeBuffer *cbuf = NULL); + void far_jump(Address entry, CodeBuffer *cbuf = NULL, Register tmp = rscratch1); + + static int far_branch_size() { + if (far_branches()) { + if (VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2)) { + return 3 * NativeInstruction::arm_insn_sz; // movw, movt, br + } else { + return 5 * NativeInstruction::arm_insn_sz; // mov, 3 orr, br + } + } else { + return NativeInstruction::arm_insn_sz; // br + } + } + + // Emit the CompiledIC call idiom + void ic_call(address entry, jint method_index = 0); + + // Data + void mov_metadata(Register dst, Metadata* obj); + Address allocate_metadata_address(Metadata* obj); + Address constant_oop_address(jobject obj); + + void movoop(Register dst, jobject obj, bool immediate = false); + + void far_load(Register dst, address addr); + void far_load_oop(Register dst, int oop_index); + void far_load_metadata(Register dst, int metadata_index); + void far_load_const(Register dst, address const); + + + // CRC32 code for java.util.zip.CRC32::updateBytes() instrinsic. + void kernel_crc32(Register crc, Register buf, Register len, + Register table0, Register table1, Register table2, Register table3, + Register tmp, Register tmp2, Register tmp3, int is_crc32c); + //AES code for com.sun.crypto.provider.AESCrypt::encryptBlock() intrinsic. + void kernel_aescrypt_encryptBlock(Register from, Register to, Register key, Register keylen, + Register table1, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7); + void kernel_aescrypt_decryptBlock(Register from, Register to, Register key, Register keylen, + Register table1, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7); + void kernel_aescrypt_round(Register table_te, Register key, + Register t0, Register t1, Register t2, Register t3, + Register a, Register tmp1, Register tmp2); + void kernel_aescrypt_firstRound(Register in, Register key, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7); + void kernel_aescrypt_lastRound( + Register table_te, Register key, Register to, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6, Register t7); + void kernel_aescrypt_lastRound_cbc( + Register table_te, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6); + + void kernel_aescrypt_encrypt(Register from, Register to, Register key, Register rvec, + Register len, Register keylen, Register table1, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6); + void kernel_aescrypt_decrypt(Register from, Register to, Register key, Register rvec, + Register len, Register keylen, Register table1, + Register t0, Register t1, Register t2, Register t3, + Register t4, Register t5, Register t6); + + void sha_round1(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh); + + void sha_round2(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh); + + void sha_round3(Register st_b, Register st_c, Register st_d, + Register tmp, Register st_f, int sh); + + void sha_w0(FloatRegister w16, FloatRegister w14, + FloatRegister w8, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, bool update); + + void sha_w(FloatRegister w16, FloatRegister w14, + 
FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4, + FloatRegister st_k, FloatRegister st_kw, Register counter, Register rtmp, + bool update = true); + + void kernel_sha_implCompress(Register from, Register state, + Register counter, Register table_k, + Register st_a, Register st_b, + Register st_c, Register st_d, Register st_e, + Register tmp, Register cunter2, Register st_new_a, Register st_w); + + void sha256_implCompress_iter( + Register ra, Register rb, Register rc, Register rd, + Register re, Register rf, Register rg, Register rh, + FloatRegister Dkw1, FloatRegister Dkw2, + Register step, + Register tmp, + Register ra2, Register re2); + void sha256_implCompress_iter0( + Register Da, Register Db, Register Dc, Register Dd, + Register De, Register Df, Register Dg, Register Dh, + FloatRegister Dkw, int index, + Register Dtmp, + Register Dnew_a, Register Dnew_e); + void sha256_w0( + FloatRegister w_m16, FloatRegister w_m15, FloatRegister w_m14, + FloatRegister w_m7, FloatRegister w_m6, + FloatRegister w_m2, + FloatRegister Qtmp_S0, FloatRegister Qtmp_S1, + FloatRegister Qtmp1); + void sha256_w(FloatRegister w16, FloatRegister w14, + FloatRegister w12, FloatRegister w10, FloatRegister w8, + FloatRegister w6, FloatRegister w4, FloatRegister w2, + FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, + FloatRegister st_kw, Register counter, Register rtmp); + + void kernel_sha256_implCompress(Register from, Register state, + Register counter, Register table_k, + Register ra, Register rb, Register rc, Register rd, Register re, + Register rf, Register rg, Register rh, + Register ra2, Register re2); + + void kernel_sha512_implCompress(Register from, Register state, + Register counter, Register table_k); + + void sha512_sigma(FloatRegister x, + FloatRegister Qtmp, FloatRegister Dsigma, int sh1, int sh2, int sh3); + void sha512_delta(FloatRegister x, + FloatRegister Qtmp, FloatRegister Ddelta, int sh1, int sh2, int sh3); + void sha512_ch(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dch); + void sha512_maj(FloatRegister x, FloatRegister y, FloatRegister z, + FloatRegister Dtmp, FloatRegister Dmaj); + + // Stack push and pop individual 64 bit registers + void push(Register src); + void pop(Register dst); + + // push all registers onto the stack + void pusha(); + void popa(); + + void repne_scan(Register addr, Register value, Register count, + Register scratch); + void repne_scanw(Register addr, Register value, Register count, + Register scratch); + + // Form an address from base + offset in Rd. Rd may or may not actually be + // used: you must use the Address that is returned. It is up to you to ensure + // that the shift provided matches the size of your data. 
+ Address form_address(Register Rd, Register base, long byte_offset, int shift);
+
+ public:
+
+ void ldr_constant(Register dest, const Address &const_addr) {
+ if (NearCpool) {
+ ldr(dest, const_addr);
+ } else {
+ mov(dest, InternalAddress(const_addr.target()));
+ ldr(dest, dest);
+ }
+ }
+
+ address read_polling_page(Register r, address page, relocInfo::relocType rtype);
+ address read_polling_page(Register r, relocInfo::relocType rtype);
+ void get_polling_page(Register dest, address page, relocInfo::relocType rtype);
+
+ // BigInteger intrinsics
+ void multiply_to_len(Register x, Register xlen, Register y, Register ylen,
+ Register z, Register zlen,
+ Register tmp1, Register tmp2, Register tmp3, Register tmp4,
+ Register tmp5, Register tmp6);
+ void mul_add(Register out, Register in, Register offset, Register len, Register k,
+ Register tmp1, Register tmp2, Register tmp3);
+
+ // CRC32 code for java.util.zip.CRC32::updateBytes() intrinsic.
+ void update_byte_crc32(Register crc, Register val, Register table);
+ void update_word_crc32(Register crc, Register v, Register tmp, Register tmp2,
+ Register table0, Register table1, Register table2, Register table3);
+// void update_byte_crc32c(Register crc, Register val, Register table);
+ void update_word_crc32c(Register crc, Register v, Register tmp, Register tmp2,
+ Register table0, Register table1, Register table2, Register table3);
+
+ // Auto dispatch for barriers isb, dmb & dsb.
+ void isb() {
+ if(VM_Version::features() & FT_ARMV7) {
+ Assembler::isb();
+ } else {
+ cp15isb();
+ }
+ }
+
+ void dsb(enum barrier option) {
+ if(VM_Version::features() & FT_ARMV7) {
+ Assembler::dsb(option);
+ } else {
+ cp15dsb();
+ }
+ }
+
+ void dmb(enum barrier option) {
+ if(VM_Version::features() & FT_ARMV7) {
+ Assembler::dmb(option);
+ } else {
+ cp15dmb();
+ }
+ }
+
+ void membar(Membar_mask_bits order_constraint) {
+ dmb(Assembler::barrier(order_constraint));
+ }
+
+ // ISB may be needed because of a safepoint
+ void maybe_isb() { MacroAssembler::isb(); }
+
+ // Helper functions for 64-bit multiplication, division and remainder
+ // does <Rd+1:Rd> = <Rn+1:Rn> * <Rm+1:Rm>
+ void mult_long(Register Rd, Register Rn, Register Rm);
+ // does <Rdh:Rd> = <Rnh:Rn> * <Rmh:Rm>
+ void mult_long(Register Rd, Register Rdh, Register Rn, Register Rnh, Register Rm, Register Rmh);
+
+ private:
+ void divide32(Register res, Register num, Register den, bool want_mod);
+ public:
+ // <Rd+1:Rd> = <Rn+1:Rn> / <Rm+1:Rm>
+ // <Rd+1:Rd> = <Rn+1:Rn> % <Rm+1:Rm>
+ // <Rd> = <Rn> / <Rm>
+ // <Rd> = <Rn> % <Rm>
+ void divide(Register Rd, Register Rn, Register Rm, int width, bool want_remainder);
+
+ void extract_bits(Register dest, Register source, int lsb, int width);
+
+ // These functions require that the src/dst register is an even register
+ // and will emit LDREXD/STREXD if there are multiple cores and the processor
+ // supports it. If there's only one core then LDRD/STRD will be emitted instead.
+ // If the processor has multiple cores and doesn't support LDREXD/STREXD then
+ // LDRD/STRD will be emitted and a warning message printed.
+ void atomic_ldrd(Register Rt, Register RtII, Register Rbase);
+ void atomic_strd(Register Rt, Register RtII, Register Rbase,
+ Register temp, Register tempII);
+
+ private:
+ // generic fallback ldrd generator.
may need to use temporary register + // when register collisions are found + // + // since double_ld_failed_dispatch can introduce address manipulation instructions + // it should return offset of first load/store instruction that will be used + // while constructing implicit null check table + int double_ld_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Register Rtmp, Condition cond); + // ldrd/strd generator. can handle all strd cases and those ldrd where there + // are no register collisions + void double_ldst_failed_dispatch(Register Rt, Register Rt2, const Address& adr, + void (Assembler::* mul)(unsigned, const Address&, Condition), + void (Assembler::* sgl)(Register, const Address&, Condition), + Condition cond); +public: + // override ldrd/strd to perform a magic for when Rt + 1 != Rt2 or any other + // conditions which prevent to use single ldrd/strd insn. a pair of ldr/str + // is used instead then + // + // Since ldrd/strd macro can introduce address manipulation instructions + // it should return offset of first load/store instruction that will be used + // while constructing implicit null check table + using Assembler::ldrd; + int ldrd(Register Rt, Register Rt2, const Address& adr, Register Rmp = rscratch1, Condition cond = C_DFLT); + using Assembler::strd; + int strd(Register Rt, Register Rt2, const Address& adr, Condition cond = C_DFLT); + +private: + void bfc_impl(Register rd, int lsb, int width, Condition cond); +public: + void bfc(Register Rd, int lsb, int width, Condition cond = C_DFLT) { + if (VM_Version::features() & (FT_ARMV6T2 | FT_ARMV7)) + Assembler::bfc(Rd, lsb, width, cond); + else + bfc_impl(Rd, lsb, width, cond); + } + + void align_stack() { + if (StackAlignmentInBytes > 4) + bic(sp, sp, StackAlignmentInBytes-1); + } + +#ifdef ASSERT + void verify_stack_alignment(); +#endif + + // Debug helper + void save_machine_state(); + void restore_machine_state(); + + static uint32_t bytecodes_until_print; + static uint32_t bytecodes_executed; + static int enable_debug; + static int enable_method_debug; + static int enable_debugging_static; + + + void bytecode_seen(Register bc_reg, Register scratch); + static void print_unseen_bytecodes(); + void reg_printf_internal(bool important, const char *fmt, Register a = r0, Register b = r0, Register c = r0); + void reg_printf_important(const char *fmt, Register a = r0, Register b = r0, Register c = r0); + void reg_printf(const char *fmt, Register a = r0, Register b = r0, Register c = r0); + void print_method_entry(Register rmethod, bool native); + void print_method_exit(bool normal = true); + void get_bytecode(Register bc, Register dst); + static void print_cpool(InstanceKlass *klass); + + void create_breakpoint(); + +#ifdef COMPILER2 + static bool _reachable_from_cache(address target); + bool reachable_from_cache(address target); + static bool _cache_fully_reachable(); + bool cache_fully_reachable(); + + void call(address target, RelocationHolder rspec, Condition cond = Assembler::AL); + + void call(address target, + relocInfo::relocType rtype = relocInfo::runtime_call_type, + Condition cond = Assembler::AL) { + call(target, Relocation::spec_simple(rtype), cond); + } + + void jump(address target, + relocInfo::relocType rtype = relocInfo::runtime_call_type, + Register scratch = noreg, + Condition cond = Assembler::AL); + + void jump(address dest, relocInfo::relocType rtype = 
relocInfo::runtime_call_type, + Condition cond = Assembler::AL) { + jump(dest, rtype, rscratch2, cond); + } + + void mov_address(Register rd, address addr, RelocationHolder const& rspec) { + assert(rspec.type() != relocInfo::runtime_call_type, "do not use mov_address for runtime calls"); + assert(rspec.type() != relocInfo::static_call_type, "do not use mov_address for relocable calls"); + if (rspec.type() == relocInfo::none) { + // absolute address, relocation not needed + mov(rd, (uint32_t)addr); + return; + } + if (VM_Version::features() & FT_ARMV6T2) { + relocate(rspec); + int c = (int)addr; + movw_i(rd, c & 0xffff); + if ((unsigned int)c >> 16) { + movt_i(rd, (unsigned int)c >> 16); + } + return; + } + Label skip_literal; + Label literal; + ldr(rd, literal); + b(skip_literal); + bind(literal); + emit_address(addr); + bind(skip_literal); + } + + void arm_stack_overflow_check(int frame_size_in_bytes, Register tmp); + void arm_stack_overflow_check(Register Rsize, Register tmp); + + void mov_relative_address(Register rd, address addr, Condition cond = Assembler::AL) { + int offset = addr - pc() - 8; + assert((offset & 3) == 0, "bad alignment"); + if (offset >= 0) { + assert(is_valid_for_imm12(offset), "addr too far"); + add(rd, r15_pc, offset, cond); + } else { + assert(is_valid_for_imm12(-offset), "addr too far"); + sub(rd, r15_pc, -offset, cond); + } + } + + void floating_cmp(Register dst); + + void fast_lock(Register Roop, Register Rbox, Register Rmark, Register Rscratch, Register Rscratch2); + void fast_unlock(Register Roop, Register Rbox, Register Rscratch, Register Rscratch2); +#endif +}; + + +#ifdef ASSERT +inline bool AbstractAssembler::pd_check_instruction_mark() { return false; } +#endif + +/** + * class SkipIfEqual: + * + * Instantiating this class will result in assembly code being output that will + * jump around any code emitted between the creation of the instance and it's + * automatic destruction at the end of a scope block, depending on the value of + * the flag passed to the constructor, which will be checked at run-time. + */ +class SkipIfEqual { + private: + MacroAssembler* _masm; + Label _label; + + public: + SkipIfEqual(MacroAssembler*, const bool* flag_addr, bool value); + ~SkipIfEqual(); +}; + +struct tableswitch { + Register _reg; + int _insn_index; + jint _first_key; + jint _last_key; + Label _after; + Label _branches; +}; + +#endif // CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_HPP --- /dev/null 2018-09-25 19:25:12.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/macroAssembler_aarch32.inline.hpp 2018-09-25 19:25:12.000000000 +0300 @@ -0,0 +1,36 @@ +/* + * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_INLINE_HPP +#define CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_INLINE_HPP + +#include "asm/assembler.hpp" + +#ifndef PRODUCT + +#endif // ndef PRODUCT + +#endif // CPU_AARCH32_VM_MACROASSEMBLER_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:13.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/methodHandles_aarch32.cpp 2018-09-25 19:25:13.000000000 +0300 @@ -0,0 +1,457 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "classfile/javaClasses.inline.hpp" +#include "interpreter/interpreter.hpp" +#include "interpreter/interpreterRuntime.hpp" +#include "memory/allocation.inline.hpp" +#include "prims/methodHandles.hpp" +#include "runtime/flags/flagSetting.hpp" +#include "runtime/frame.inline.hpp" + +#define __ _masm-> + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) __ block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") + +void MethodHandles::load_klass_from_Class(MacroAssembler* _masm, Register klass_reg) { + if (VerifyMethodHandles) + verify_klass(_masm, klass_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_Class), + "MH argument is a Class"); + __ ldr(klass_reg, Address(klass_reg, java_lang_Class::klass_offset_in_bytes())); +} + +#ifdef ASSERT +static int check_nonzero(const char* xname, int x) { + assert(x != 0, "%s should be nonzero", xname); + return x; +} +#define NONZERO(x) check_nonzero(#x, x) +#else //ASSERT +#define NONZERO(x) (x) +#endif //PRODUCT + +#ifdef ASSERT +void MethodHandles::verify_klass(MacroAssembler* _masm, + Register obj, SystemDictionary::WKID klass_id, + const char* error_message) { + // FIXME Did this code ever work? + // or have I changed the working of cmpptr? + // previously cmpptr took the klass_addr, did it also do dereference before the comparison? 
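+ // Roughly, the check below accepts obj when its klass is the well-known klass
+ // itself, or when the supertype recorded at that klass's super_check_offset
+ // inside obj's klass matches it; anything else falls through to L_bad and
+ // stops. This presumes cmpptr(Register, ExternalAddress) loads the cell at
+ // klass_addr before comparing, which is exactly what the FIXME above questions.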
+ InstanceKlass** klass_addr = SystemDictionary::well_known_klass_addr(klass_id); + Klass* klass = SystemDictionary::well_known_klass(klass_id); + Register temp = rscratch2; + Register temp2 = rscratch1; // used by MacroAssembler::cmpptr + Label L_ok, L_bad; + BLOCK_COMMENT("verify_klass {"); + __ verify_oop(obj); + __ cbz(obj, L_bad); + __ push(RegSet::of(temp, temp2), sp); + __ load_klass(temp, obj); + __ cmpptr(temp, ExternalAddress((address) klass_addr)); + __ b(L_ok, Assembler::EQ); + intptr_t super_check_offset = klass->super_check_offset(); + __ ldr(temp, Address(temp, super_check_offset)); + __ cmpptr(temp, ExternalAddress((address) klass_addr)); + __ b(L_ok, Assembler::EQ); + __ pop(RegSet::of(temp, temp2), sp); + __ bind(L_bad); + __ stop(error_message); + __ BIND(L_ok); + __ pop(RegSet::of(temp, temp2), sp); + BLOCK_COMMENT("} verify_klass"); +} + +void MethodHandles::verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) { } + +#endif //ASSERT + +void MethodHandles::jump_from_method_handle(MacroAssembler* _masm, Register method, Register temp, + bool for_compiler_entry) { + assert(method == rmethod, "interpreter calling convention"); + Label L_no_such_method; + __ cbz(rmethod, L_no_such_method); + __ verify_method_ptr(method); + + if (!for_compiler_entry && JvmtiExport::can_post_interpreter_events()) { + Label run_compiled_code; + // JVMTI events, such as single-stepping, are implemented partly by avoiding running + // compiled code in threads for which the event is enabled. Check here for + // interp_only_mode if these events CAN be enabled. + + __ ldrb(rscratch1, Address(rthread, JavaThread::interp_only_mode_offset())); + __ cbnz(rscratch1, run_compiled_code); + __ ldr(rscratch1, Address(method, Method::interpreter_entry_offset())); + __ b(rscratch1); + __ BIND(run_compiled_code); + } + + const ByteSize entry_offset = for_compiler_entry ? Method::from_compiled_offset() : + Method::from_interpreted_offset(); + __ ldr(rscratch1,Address(method, entry_offset)); + __ b(rscratch1); + __ bind(L_no_such_method); + __ far_jump(RuntimeAddress(StubRoutines::throw_AbstractMethodError_entry())); +} + +void MethodHandles::jump_to_lambda_form(MacroAssembler* _masm, + Register recv, Register method_temp, + Register temp2, + bool for_compiler_entry) { + BLOCK_COMMENT("jump_to_lambda_form {"); + // This is the initial entry point of a lazy method handle. + // After type checking, it picks up the invoker from the LambdaForm. 
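+ // In short, the loads below walk
+ // recv (MethodHandle) -> form (LambdaForm) -> vmentry (MemberName)
+ // -> method (ResolvedMethodName) -> vmtarget (Method*)
+ // and then transfer control through jump_from_method_handle().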
+ assert_different_registers(recv, method_temp, temp2); + assert(recv != noreg, "required register"); + assert(method_temp == rmethod, "required register for loading method"); + + //NOT_PRODUCT({ FlagSetting fs(TraceMethodHandles, true); trace_method_handle(_masm, "LZMH"); }); + + // Load the invoker, as MH -> MH.form -> LF.vmentry + __ verify_oop(recv); + __ load_heap_oop(method_temp, Address(recv, NONZERO(java_lang_invoke_MethodHandle::form_offset_in_bytes())), temp2); + __ verify_oop(method_temp); + __ load_heap_oop(method_temp, Address(method_temp, NONZERO(java_lang_invoke_LambdaForm::vmentry_offset_in_bytes())), temp2); + __ verify_oop(method_temp); + __ load_heap_oop(method_temp, Address(method_temp, NONZERO(java_lang_invoke_MemberName::method_offset_in_bytes())), temp2); + __ verify_oop(method_temp); + __ access_load_word_at(T_ADDRESS, IN_HEAP, method_temp, Address(method_temp, NONZERO(java_lang_invoke_ResolvedMethodName::vmtarget_offset_in_bytes())), noreg, noreg); + + if (VerifyMethodHandles && !for_compiler_entry) { + // make sure recv is already on stack + __ ldr(temp2, Address(method_temp, Method::const_offset())); + __ load_sized_value(temp2, + Address(temp2, ConstMethod::size_of_parameters_offset()), + sizeof(u2), /*is_signed*/ false); + // assert(sizeof(u2) == sizeof(Method::_size_of_parameters), ""); + Label L; + __ ldr(rscratch1, __ argument_address(temp2, -1)); + __ cmpoop(recv, rscratch1); + __ b(L, Assembler::EQ); + __ ldr(r0, __ argument_address(temp2, -1)); + __ hlt(0); + __ BIND(L); + } + + jump_from_method_handle(_masm, method_temp, temp2, for_compiler_entry); + BLOCK_COMMENT("} jump_to_lambda_form"); +} + +// Code generation +address MethodHandles::generate_method_handle_interpreter_entry(MacroAssembler* _masm, + vmIntrinsics::ID iid) { + const bool not_for_compiler_entry = false; // this is the interpreter entry + assert(is_signature_polymorphic(iid), "expected invoke iid"); + if (iid == vmIntrinsics::_invokeGeneric || + iid == vmIntrinsics::_compiledLambdaForm) { + // Perhaps surprisingly, the symbolic references visible to Java are not directly used. + // They are linked to Java-generated adapters via MethodHandleNatives.linkMethod. + // They all allow an appendix argument. + __ hlt(0); // empty stubs make SG sick + return NULL; + } + + // rmethod: Method* + // r3: argument locator (parameter slot count, added to rsp) + // r1: used as temp to hold mh or receiver + // r0, r11: garbage temps, blown away + Register argp = r3; // argument list ptr, live on error paths + Register temp = r0; + Register mh = r1; // MH receiver; dies quickly and is recycled + + // here's where control starts out: + __ align(CodeEntryAlignment); + address entry_point = __ pc(); + + if (VerifyMethodHandles) { + assert(Method::intrinsic_id_size_in_bytes() == 2, "assuming Method::_intrinsic_id is u2"); + Label L; + BLOCK_COMMENT("verify_intrinsic_id {"); + __ ldrh(rscratch1, Address(rmethod, Method::intrinsic_id_offset_in_bytes())); + __ cmp(rscratch1, (int) iid, temp); + __ b(L, Assembler::EQ); + if (iid == vmIntrinsics::_linkToVirtual || + iid == vmIntrinsics::_linkToSpecial) { + // could do this for all kinds, but would explode assembly code size + trace_method_handle(_masm, "bad Method*::intrinsic_id"); + } + __ hlt(0); + __ bind(L); + BLOCK_COMMENT("} verify_intrinsic_id"); + } + + // First task: Find out how big the argument list is. 
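+ // (For the ref kinds that carry a receiver, the parameter count is read from
+ // ConstMethod as a u2, and argument_address(argp, -1) then points at the
+ // receiver/MethodHandle argument slot; see the uses of r3_first_arg_addr below.)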
+ Address r3_first_arg_addr; + int ref_kind = signature_polymorphic_intrinsic_ref_kind(iid); + assert(ref_kind != 0 || iid == vmIntrinsics::_invokeBasic, "must be _invokeBasic or a linkTo intrinsic"); + if (ref_kind == 0 || MethodHandles::ref_kind_has_receiver(ref_kind)) { + __ ldr(argp, Address(rmethod, Method::const_offset())); + __ load_sized_value(argp, + Address(argp, ConstMethod::size_of_parameters_offset()), + sizeof(u2), /*is_signed*/ false); + // assert(sizeof(u2) == sizeof(Method::_size_of_parameters), ""); + r3_first_arg_addr = __ argument_address(argp, -1); + } else { + DEBUG_ONLY(argp = noreg); + } + + if (!is_signature_polymorphic_static(iid)) { + __ ldr(mh, r3_first_arg_addr); + DEBUG_ONLY(argp = noreg); + } + + // r3_first_arg_addr is live! + + trace_method_handle_interpreter_entry(_masm, iid); + if (iid == vmIntrinsics::_invokeBasic) { + generate_method_handle_dispatch(_masm, iid, mh, noreg, not_for_compiler_entry); + + } else { + // Adjust argument list by popping the trailing MemberName argument. + Register recv = noreg; + if (MethodHandles::ref_kind_has_receiver(ref_kind)) { + // Load the receiver (not the MH; the actual MemberName's receiver) up from the interpreter stack. + __ ldr(recv = r2, r3_first_arg_addr); + } + DEBUG_ONLY(argp = noreg); + Register rmember = rmethod; // MemberName ptr; incoming method ptr is dead now + __ pop(rmember); // extract last argument + generate_method_handle_dispatch(_masm, iid, recv, rmember, not_for_compiler_entry); + } + + return entry_point; +} + + +void MethodHandles::generate_method_handle_dispatch(MacroAssembler* _masm, + vmIntrinsics::ID iid, + Register receiver_reg, + Register member_reg, + bool for_compiler_entry) { + assert(is_signature_polymorphic(iid), "expected invoke iid"); + // temps used in this code are not used in *either* compiled or interpreted calling sequences + // use interpreter caching registers (caller-save in compiler). + // Starting from r5 as r4 used by gen_special_dispatch. + Register temp1 = r5; + Register temp2 = r6; + Register temp3 = r7; + assert_different_registers(temp1, temp2, temp3, receiver_reg, member_reg); + if (for_compiler_entry) { + assert(receiver_reg == (iid == vmIntrinsics::_linkToStatic ? noreg : j_rarg0), "only valid assignment"); + assert_different_registers(temp1, j_rarg0, j_rarg1, j_rarg2, j_rarg3); + assert_different_registers(temp2, j_rarg0, j_rarg1, j_rarg2, j_rarg3); + assert_different_registers(temp3, j_rarg0, j_rarg1, j_rarg2, j_rarg3); + } + + assert_different_registers(temp1, temp2, temp3, receiver_reg); + assert_different_registers(temp1, temp2, temp3, member_reg); + + if (iid == vmIntrinsics::_invokeBasic) { + // indirect through MH.form.vmentry.vmtarget + jump_to_lambda_form(_masm, receiver_reg, rmethod, temp1, for_compiler_entry); + + } else { + // The method is a member invoker used by direct method handles. 
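+ // Sketch of the cases handled below: _linkToSpecial and _linkToStatic load
+ // the Method* directly from MemberName.method.vmtarget; _linkToVirtual
+ // dispatches through the vtable using MemberName.vmindex; _linkToInterface
+ // resolves MemberName.clazz and dispatches through the itable. When
+ // VerifyMethodHandles is on, the receiver (if any) is first checked against
+ // MemberName.clazz (except for _linkToInterface).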
+ if (VerifyMethodHandles) { + // make sure the trailing argument really is a MemberName (caller responsibility) + verify_klass(_masm, member_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_invoke_MemberName), + "MemberName required for invokeVirtual etc."); + } + + Address member_clazz( member_reg, NONZERO(java_lang_invoke_MemberName::clazz_offset_in_bytes())); + Address member_vmindex( member_reg, NONZERO(java_lang_invoke_MemberName::vmindex_offset_in_bytes())); + Address member_vmtarget( member_reg, NONZERO(java_lang_invoke_MemberName::method_offset_in_bytes())); + Address vmtarget_method( rmethod, NONZERO(java_lang_invoke_ResolvedMethodName::vmtarget_offset_in_bytes())); + + Register temp1_recv_klass = temp1; + if (iid != vmIntrinsics::_linkToStatic) { + __ verify_oop(receiver_reg); + if (iid == vmIntrinsics::_linkToSpecial) { + // Don't actually load the klass; just null-check the receiver. + __ null_check(receiver_reg); + } else { + // load receiver klass itself + __ null_check(receiver_reg, oopDesc::klass_offset_in_bytes()); + __ load_klass(temp1_recv_klass, receiver_reg); + __ verify_klass_ptr(temp1_recv_klass); + } + BLOCK_COMMENT("check_receiver {"); + // The receiver for the MemberName must be in receiver_reg. + // Check the receiver against the MemberName.clazz + if (VerifyMethodHandles && iid == vmIntrinsics::_linkToSpecial) { + // Did not load it above... + __ load_klass(temp1_recv_klass, receiver_reg); + __ verify_klass_ptr(temp1_recv_klass); + } + if (VerifyMethodHandles && iid != vmIntrinsics::_linkToInterface) { + Label L_ok; + Register temp2_defc = temp2; + __ load_heap_oop(temp2_defc, member_clazz, temp3); + load_klass_from_Class(_masm, temp2_defc); + __ verify_klass_ptr(temp2_defc); + __ check_klass_subtype(temp1_recv_klass, temp2_defc, temp3, L_ok); + // If we get here, the type check failed! + __ hlt(0); + // __ STOP("receiver class disagrees with MemberName.clazz"); + __ bind(L_ok); + } + BLOCK_COMMENT("} check_receiver"); + } + if (iid == vmIntrinsics::_linkToSpecial || + iid == vmIntrinsics::_linkToStatic) { + DEBUG_ONLY(temp1_recv_klass = noreg); // these guys didn't load the recv_klass + } + + // Live registers at this point: + // member_reg - MemberName that was the trailing argument + // temp1_recv_klass - klass of stacked receiver, if needed + // r1 ... 
r0 - compiler arguments (if compiled) + + Label L_incompatible_class_change_error; + switch (iid) { + case vmIntrinsics::_linkToSpecial: + if (VerifyMethodHandles) { + verify_ref_kind(_masm, JVM_REF_invokeSpecial, member_reg, temp3); + } + __ load_heap_oop(rmethod, member_vmtarget); + __ access_load_word_at(T_ADDRESS, IN_HEAP, rmethod, vmtarget_method, noreg, noreg); + break; + + case vmIntrinsics::_linkToStatic: + if (VerifyMethodHandles) { + verify_ref_kind(_masm, JVM_REF_invokeStatic, member_reg, temp3); + } + __ load_heap_oop(rmethod, member_vmtarget); + __ access_load_word_at(T_ADDRESS, IN_HEAP, rmethod, vmtarget_method, noreg, noreg); + break; + + case vmIntrinsics::_linkToVirtual: + { + // same as TemplateTable::invokevirtual, + // minus the CP setup and profiling: + + if (VerifyMethodHandles) { + verify_ref_kind(_masm, JVM_REF_invokeVirtual, member_reg, temp3); + } + + // pick out the vtable index from the MemberName, and then we can discard it: + Register temp2_index = temp2; + __ access_load_word_at(T_ADDRESS, IN_HEAP, temp2_index, member_vmindex, noreg, noreg); + + if (VerifyMethodHandles) { + Label L_index_ok; + __ cmp(temp2_index, 0U); + __ b(L_index_ok, Assembler::GE); + __ hlt(0); + __ BIND(L_index_ok); + } + + // Note: The verifier invariants allow us to ignore MemberName.clazz and vmtarget + // at this point. And VerifyMethodHandles has already checked clazz, if needed. + + // get target Method* & entry point + __ lookup_virtual_method(temp1_recv_klass, temp2_index, rmethod); + break; + } + + case vmIntrinsics::_linkToInterface: + { + // same as TemplateTable::invokeinterface + // (minus the CP setup and profiling, with different argument motion) + if (VerifyMethodHandles) { + verify_ref_kind(_masm, JVM_REF_invokeInterface, member_reg, temp3); + } + + Register temp3_intf = temp3; + __ load_heap_oop(temp3_intf, member_clazz); + load_klass_from_Class(_masm, temp3_intf); + __ verify_klass_ptr(temp3_intf); + + Register rindex = rmethod; + __ access_load_word_at(T_ADDRESS, IN_HEAP, rindex, member_vmindex, noreg, noreg); + if (VerifyMethodHandles) { + Label L; + __ cmp(rindex, 0); + __ b(L, Assembler::GE); + __ hlt(0); + __ bind(L); + } + + // given intf, index, and recv klass, dispatch to the implementation method + __ lookup_interface_method(temp1_recv_klass, temp3_intf, + // note: next two args must be the same: + rindex, rmethod, + temp2, + L_incompatible_class_change_error); + break; + } + + default: + fatal("unexpected intrinsic %d: %s", iid, vmIntrinsics::name_at(iid)); + break; + } + + // live at this point: rmethod, r13 (if interpreted) + + // After figuring out which concrete method to call, jump into it. + // Note that this works in the interpreter with no data motion. + // But the compiled version will require that r2_recv be shifted out. + __ verify_method_ptr(rmethod); + jump_from_method_handle(_masm, rmethod, temp1, for_compiler_entry); + if (iid == vmIntrinsics::_linkToInterface) { + __ bind(L_incompatible_class_change_error); + __ far_jump(RuntimeAddress(StubRoutines::throw_IncompatibleClassChangeError_entry())); + } + } +} + +#ifndef PRODUCT +void trace_method_handle_stub(const char* adaptername, + oop mh, + intptr_t* saved_regs, + intptr_t* entry_sp) { } + +// The stub wraps the arguments in a struct on the stack to avoid +// dealing with the different calling conventions for passing 6 +// arguments. 
+struct MethodHandleStubArguments { + const char* adaptername; + oopDesc* mh; + intptr_t* saved_regs; + intptr_t* entry_sp; +}; +void trace_method_handle_stub_wrapper(MethodHandleStubArguments* args) { } + +void MethodHandles::trace_method_handle(MacroAssembler* _masm, const char* adaptername) { } +#endif //PRODUCT --- /dev/null 2018-09-25 19:25:14.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/methodHandles_aarch32.hpp 2018-09-25 19:25:14.000000000 +0300 @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2010, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +// Platform-specific definitions for method handles. +// These definitions are inlined into class MethodHandles. + +// Adapters +enum /* platform_dependent_constants */ { + adapter_code_size = 32000 DEBUG_ONLY(+ 120000) +}; + +public: + + static void load_klass_from_Class(MacroAssembler* _masm, Register klass_reg); + + static void verify_klass(MacroAssembler* _masm, + Register obj, SystemDictionary::WKID klass_id, + const char* error_message = "wrong klass") NOT_DEBUG_RETURN; + + static void verify_method_handle(MacroAssembler* _masm, Register mh_reg) { + verify_klass(_masm, mh_reg, SystemDictionary::WK_KLASS_ENUM_NAME(java_lang_invoke_MethodHandle), + "reference is a MH"); + } + + static void verify_ref_kind(MacroAssembler* _masm, int ref_kind, Register member_reg, Register temp) NOT_DEBUG_RETURN; + + // Similar to InterpreterMacroAssembler::jump_from_interpreted. + // Takes care of special dispatch from single stepping too. + static void jump_from_method_handle(MacroAssembler* _masm, Register method, Register temp, + bool for_compiler_entry); + + static void jump_to_lambda_form(MacroAssembler* _masm, + Register recv, Register method_temp, + Register temp2, + bool for_compiler_entry); + + static Register saved_last_sp_register() { + // Should be in sharedRuntime, not here. + return noreg; + } --- /dev/null 2018-09-25 19:25:15.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/nativeInst_aarch32.cpp 2018-09-25 19:25:15.000000000 +0300 @@ -0,0 +1,719 @@ +/* + * Copyright (c) 1997, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2018, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "code/codeCache.hpp" +#include "memory/resourceArea.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/oop.inline.hpp" +#include "runtime/handles.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "utilities/ostream.hpp" +#ifdef COMPILER1 +#include "c1/c1_Runtime1.hpp" +#endif + +// LIRAssembler fills patching site with nops up to NativeCall::instruction_size +int NativeCall::instruction_size = 5 * arm_insn_sz; + +NativeInstruction* NativeInstruction::from(address addr) { + return (NativeInstruction*) addr; +} + +//------------------------------------------------------------------- + +void NativeCall::init() { + instruction_size = (VM_Version::features() & (FT_ARMV6T2 | FT_ARMV7) ? 3 : 5) * arm_insn_sz; +} + +void NativeCall::verify() { + if (!is_call()) { + fatal("not a call"); + } +} + +address NativeCall::destination() const { + assert(is_call(), "not a call"); + if (NativeImmCall::is_at(addr())) { + return NativeImmCall::from(addr())->destination(); + } else if (NativeMovConstReg::is_at(addr())) { + return address(NativeMovConstReg::from(addr())->data()); + } else if (NativeTrampolineCall::is_at(addr())) { + return NativeTrampolineCall::from(addr())->destination(); + } + ShouldNotReachHere(); + return NULL; +} + +void NativeCall::set_destination(address dest) { + assert(is_call(), "not a call"); + if (NativeImmCall::is_at(addr())) { + NativeImmCall::from(addr())->set_destination(dest); + } else if (NativeMovConstReg::is_at(addr())) { + NativeMovConstReg::from(addr())->set_data((uintptr_t) dest); + } else if (NativeTrampolineCall::is_at(addr())) { + NativeTrampolineCall::from(addr())->set_destination(dest); + } else { + ShouldNotReachHere(); + } +} + +void NativeCall::set_destination_mt_safe(address dest, bool assert_lock) { + assert(is_call(), "not a call"); + + // patching should be not only safe (i.e. 
this call could be executed by some thread), + // but it also should be atomic (some other thread could call NativeCall::destination() + // and see valid destination value) + + if (NativeImmCall::is_at(addr())) { + NativeImmCall::from(addr())->set_destination(dest); + ICache::invalidate_word(addr()); + } else if (NativeTrampolineCall::is_at(addr())) { + NativeTrampolineCall::from(addr())->set_destination_mt_safe(dest); + } else { + ShouldNotReachHere(); + } +} + +void NativeCall::insert(address code_pos, address entry) { + Unimplemented(); +} + +bool NativeCall::is_call_before(address return_address) { + if (NativeTrampolineCall::is_at(return_address - NativeCall::instruction_size)) { + return true; + } + + if (NativeMovConstReg::is_at(return_address - NativeCall::instruction_size)) { + NativeMovConstReg *nm = NativeMovConstReg::from(return_address - NativeCall::instruction_size); + address next_instr = nm->next_instruction_address(); + if (NativeRegCall::is_at(next_instr) && NativeRegCall::from(next_instr)->destination() == nm->destination()) { + return true; + } + } + + if (NativeImmCall::is_at(return_address - NativeBranchType::instruction_size)) { + return true; + } + + return false; +} + +address NativeCall::next_instruction_address() const { + assert(is_call(), "not a call"); + if (NativeImmCall::is_at(addr())) { + return NativeImmCall::from(addr())->next_instruction_address(); + } else if (NativeMovConstReg::is_at(addr())) { + NativeMovConstReg *nm = NativeMovConstReg::from(addr()); + address next_instr = nm->next_instruction_address(); + assert(NativeRegCall::is_at(next_instr), "should be"); + return NativeRegCall::from(next_instr)->next_instruction_address(); + } else if (NativeTrampolineCall::is_at(addr())) { + return NativeTrampolineCall::from(addr())->next_instruction_address(); + } else { + ShouldNotReachHere(); + return NULL; + } +} + +address NativeCall::return_address() const { + return next_instruction_address(); +} + +bool NativeCall::is_at(address addr) { + if (NativeImmCall::is_at(addr)) { + return true; + } + + if (NativeMovConstReg::is_at(addr)) { + NativeMovConstReg *nm = NativeMovConstReg::from(addr); + address next_instr = nm->next_instruction_address(); + if (NativeRegCall::is_at(next_instr) && + NativeRegCall::from(next_instr)->destination() == nm->destination()) { + return true; + } + } + + if (NativeTrampolineCall::is_at(addr)) { + return true; + } + + return false; +} + +NativeCall* NativeCall::from(address addr) { + assert(NativeCall::is_at(addr), ""); + return (NativeCall*) addr; +} + +//------------------------------------------------------------------- + +address NativeTrampolineCall::destination() const { + assert(is_at(addr()), "not call"); + return (address) uint_at(8); +} + +void NativeTrampolineCall::set_destination(address dest) { + assert(is_at(addr()), "not call"); + set_uint_at(8, (uintptr_t) dest); +} + +void NativeTrampolineCall::set_destination_mt_safe(address dest, bool assert_lock) { + assert(is_at(addr()), "not call"); + set_destination(dest); + ICache::invalidate_word(addr() + 8); +} + +bool NativeTrampolineCall::is_at(address addr) { + return (as_uint(addr ) & ~0xffu) == 0xe28fe000 // add lr, pc, #disp + && as_uint(addr + 4) == 0xe51ff004; // ldr pc, [pc, -4] +} + +NativeTrampolineCall* NativeTrampolineCall::from(address addr) { + assert(NativeTrampolineCall::is_at(addr), ""); + return (NativeTrampolineCall*) addr; +} + +//------------------------------------------------------------------- + +address NativeImmCall::destination() const { + 
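// The 24-bit field sign-extracted below encodes a word offset relative to the
+ // ARM-visible PC, i.e. this instruction's address plus 8. For example, an
+ // offset field of 0x000010 in an instruction at 0x1000 resolves to
+ // 0x1000 + 8 + (0x10 << 2) = 0x1048. +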
assert(is_imm_call(), "not call"); + uint32_t insn = as_uint(); + intptr_t off = Instruction_aarch32::sextract(insn, 23, 0); + address destination = addr() + 8 + (off << 2); + return destination; +} + +void NativeImmCall::set_destination(address dest) { + assert(is_imm_call(), "not call"); + patch_offset_to(dest); +} + +bool NativeImmCall::is_at(address addr) { + return Instruction_aarch32::extract(as_uint(addr), 27, 24) == 0b1011; +} + +NativeImmCall* NativeImmCall::from(address addr) { + assert(NativeImmCall::is_at(addr), ""); + return (NativeImmCall*) addr; +} + +//------------------------------------------------------------------- + +Register NativeRegCall::destination() const { + assert(is_reg_call(), "not call"); + return (Register) Instruction_aarch32::extract(as_uint(), 3, 0); +} + +bool NativeRegCall::is_at(address addr) { + unsigned insn = as_uint(addr); + return is_branch_type(insn) && Instruction_aarch32::extract(insn, 7, 4) == 0b0011; +} + +NativeRegCall* NativeRegCall::from(address addr) { + assert(NativeRegCall::is_at(addr), ""); + return (NativeRegCall*) addr; +} + +//------------------------------------------------------------------- + +address NativeFarLdr::skip_patching_prolog(address addr) { + if (NativeInstruction::from(addr)->is_nop() && + NativeInstruction::from(addr + arm_insn_sz)->is_barrer()) { + return addr+2*arm_insn_sz; + } + return addr; +} + +bool NativeFarLdr::is_at(address addr) { + addr = skip_patching_prolog(addr); + unsigned add_condidate = as_uint(addr); + if (((Instruction_aarch32::extract(add_condidate, 27, 21) != 0b0010100) /*add*/ && + (Instruction_aarch32::extract(add_condidate, 27, 21) != 0b0010010) /*sub*/) || + (Instruction_aarch32::extract(add_condidate, 19, 16) != (unsigned) r15_pc->encoding())) { + return false; + } + Register dest = as_Register(Instruction_aarch32::extract(add_condidate, 15, 12)); + return NativeMovConstReg::is_ldr_literal_at(addr + arm_insn_sz, dest); +} + +NativeFarLdr* NativeFarLdr::from(address addr) { + assert(is_at(addr), ""); + return (NativeFarLdr*) addr; +} + +intptr_t* NativeFarLdr::data_addr() { + address self = skip_patching_prolog(addr()); + off_t offset = 8; + off_t add_off = Assembler::decode_imm12(as_uint(self) & 0xfff); + if (Instruction_aarch32::extract(as_uint(self), 24, 21) == 0x4) { + offset += add_off; + } else { + offset -= add_off; + } + off_t ldr_off = as_uint(self + arm_insn_sz) & 0xfff; + if (Instruction_aarch32::extract(as_uint(self), 23, 23)) { + offset += ldr_off; + } else { + offset -= ldr_off; + } + + return (intptr_t*)(self + offset); +} + +void NativeFarLdr::set_data_addr(intptr_t *data_addr) { + address self = skip_patching_prolog(addr()); + off_t offset = (address)data_addr - (self + 8); + bool minus = false; + if (offset < 0) { + offset = -offset; + minus = true; + } + guarantee((0 <= offset) && (offset <= 0xffffff), "offset too large"); + set_uint_at(self - addr(), (as_uint(self) & ~0xc00fff) | + (minus ? 0x400000u /*sub*/ : 0x800000u /*add*/) | + Assembler::encode_imm12(offset & 0xff000)); + + set_uint_at(self - addr() + arm_insn_sz, + (as_uint(self + arm_insn_sz) & ~0x800fff) | + (minus ? 
0x000000 : 0x800000) | + (offset & 0xfff)); + ICache::invalidate_range(self, 2*arm_insn_sz); +} + +address NativeFarLdr::next_instruction_address() const { + return skip_patching_prolog(addr()) + NativeMovConstReg::far_ldr_sz; +} + +//------------------------------------------------------------------- + +void NativeMovConstReg::verify() { + if (!is_mov_const_reg()) { + fatal("not a mov const reg"); + } +} + +intptr_t NativeMovConstReg::data() const { + if (NativeFarLdr::is_at(addr())) { + return *NativeFarLdr::from(addr())->data_addr(); + } + return (intptr_t) MacroAssembler::target_addr_for_insn(addr()); +} + +void NativeMovConstReg::set_data(intptr_t x) { + if (NativeFarLdr::is_at(addr())) { + *NativeFarLdr::from(addr())->data_addr() = x; + // Fences should be provided by calling code! + } else { + // Store x into the instruction stream. + MacroAssembler::pd_patch_instruction(addr(), (address)x); + ICache::invalidate_range(addr(), max_instruction_size); + } + + // Find and replace the oop/metadata corresponding to this + // instruction in oops section. + CodeBlob* cb = CodeCache::find_blob(addr()); + nmethod* nm = cb->as_nmethod_or_null(); + if (nm != NULL) { + RelocIterator iter(nm, addr(), next_instruction_address()); + while (iter.next()) { + if (iter.type() == relocInfo::oop_type) { + oop* oop_addr = iter.oop_reloc()->oop_addr(); + *oop_addr = cast_to_oop(x); + break; + } else if (iter.type() == relocInfo::metadata_type) { + Metadata** metadata_addr = iter.metadata_reloc()->metadata_addr(); + *metadata_addr = (Metadata*)x; + break; + } + } + } +} + +void NativeMovConstReg::print() { + tty->print_cr(PTR_FORMAT ": mov reg, " INTPTR_FORMAT, + p2i(addr()), data()); +} + +Register NativeMovConstReg::destination() const { + return (Register) Instruction_aarch32::extract(as_uint(), 15, 12); +} + +NativeMovConstReg* NativeMovConstReg::from(address addr) { + assert(NativeMovConstReg::is_at(addr), ""); + return (NativeMovConstReg*) addr; +} + +bool NativeMovConstReg::is_ldr_literal_at(address addr, Register from) { + unsigned insn = as_uint(addr); + if (from == noreg) { + return (Instruction_aarch32::extract(insn, 27, 20) & 0b11100101) == 0b01000001; + } + unsigned reg = from->encoding(); + return (Instruction_aarch32::extract(insn, 27, 16) & 0b111001011111) == (0b010000010000 | reg); +} + +bool NativeMovConstReg::is_far_ldr_literal_at(address addr) { + return NativeFarLdr::is_at(addr); +} + +bool NativeMovConstReg::is_movw_movt_at(address addr) { + unsigned insn = as_uint(addr); + unsigned insn2 = as_uint(addr + arm_insn_sz); + return Instruction_aarch32::extract(insn, 27, 20) == 0b00110000 && //mov + Instruction_aarch32::extract(insn2, 27, 20) == 0b00110100; //movt +} + +bool NativeMovConstReg::is_mov_n_three_orr_at(address addr) { + return (Instruction_aarch32::extract(as_uint(addr), 27, 16) & 0b111111101111) == 0b001110100000 && + Instruction_aarch32::extract(as_uint(addr+arm_insn_sz), 27, 20) == 0b00111000 && + Instruction_aarch32::extract(as_uint(addr+2*arm_insn_sz), 27, 20) == 0b00111000 && + Instruction_aarch32::extract(as_uint(addr+3*arm_insn_sz), 27, 21) == 0b0011100; +} + +bool NativeMovConstReg::is_at(address addr) { + return is_ldr_literal_at(addr) || + is_far_ldr_literal_at(addr) || + is_movw_movt_at(addr) || + is_mov_n_three_orr_at(addr); +} + +//------------------------------------------------------------------- +address NativeMovRegMem::instruction_address() const { + return addr(); +} + +int NativeMovRegMem::offset() const { + assert(NativeMovConstReg::is_at(addr()), "no 
others"); + return NativeMovConstReg::from(addr())->data(); +} + +void NativeMovRegMem::set_offset(int x) { + assert(NativeMovConstReg::is_at(addr()), "no others"); + NativeMovConstReg::from(addr())->set_data(x); +} + +void NativeMovRegMem::verify() { + assert(NativeMovConstReg::is_at(addr()), "no others"); +} + +//-------------------------------------------------------------------------------- + +void NativeJump::verify() { + if (!is_jump()) { + fatal("not a call"); + } +} + +void NativeJump::check_verified_entry_alignment(address entry, address verified_entry) { +} + +address NativeJump::jump_destination() const { + assert(is_jump(), "not a call"); + if (NativeImmJump::is_at(addr())) { + return NativeImmJump::from(addr())->destination(); + } else if (NativeMovConstReg::is_at(addr())) { + return address(NativeMovConstReg::from(addr())->data()); + } + ShouldNotReachHere(); + return NULL; +} + +void NativeJump::set_jump_destination(address dest) { + assert(is_jump(), "not a call"); + if (NativeImmJump::is_at(addr())) { + NativeImmJump::from(addr())->set_destination(dest); + } else if (NativeMovConstReg::is_at(addr())) { + NativeMovConstReg::from(addr())->set_data((uintptr_t) dest); + } else { + ShouldNotReachHere(); + } +} + +address NativeJump::next_instruction_address() const { + assert(is_jump(), "not a call"); + if (NativeImmJump::is_at(addr())) { + return NativeImmJump::from(addr())->next_instruction_address(); + } else if (NativeMovConstReg::is_at(addr())) { + address after_move = NativeMovConstReg::from(addr())->next_instruction_address(); + assert(NativeRegJump::is_at(after_move), "should be jump"); + return NativeRegJump::from(after_move)->next_instruction_address(); + } + ShouldNotReachHere(); + return NULL; +} + +bool NativeJump::is_at(address addr) { + if (NativeImmJump::is_at(addr)) { + return true; + } + if (NativeMovConstReg::is_at(addr)) { + NativeMovConstReg *nm = NativeMovConstReg::from(addr); + address next_instr = nm->next_instruction_address(); + return NativeRegJump::is_at(next_instr) && + NativeRegJump::from(next_instr)->destination() == nm->destination(); + } + return false; +} + +NativeJump* NativeJump::from(address addr) { + assert(NativeJump::is_at(addr), ""); + return (NativeJump*) addr; +} + +// MT-safe inserting of a jump over a jump or a nop (used by +// nmethod::make_not_entrant_or_zombie) + +void NativeJump::patch_verified_entry(address entry, address verified_entry, address dest) { + + assert(dest == SharedRuntime::get_handle_wrong_method_stub(), + "expected fixed destination of patch"); + assert(NativeInstruction::from(verified_entry)->is_jump_or_nop() || + NativeInstruction::from(verified_entry)->is_sigill_zombie_not_entrant(), + "Aarch32 cannot replace non-jump with jump"); + + // Patch this nmethod atomically. + if (Assembler::reachable_from_branch_at(verified_entry, dest)) { + assert((((intptr_t) dest & 0x3) == 0) && (((intptr_t) verified_entry & 0x3) == 0), + "addresses should be aligned on 4"); + ptrdiff_t disp = (dest - verified_entry - 8) >> 2; + guarantee((-(1 << 23) <= disp) && (disp < (1 << 23)), "branch overflow"); + + unsigned int insn = (0b11101010 << 24) | (disp & 0xffffff); + *(unsigned int*)verified_entry = insn; + } else { + // We use an illegal instruction for marking a method as + // not_entrant or zombie. 
+ NativeIllegalInstruction::insert(verified_entry);
+ }
+
+ ICache::invalidate_range(verified_entry, instruction_size);
+}
+
+//-------------------------------------------------------------------
+
+bool NativeBranchType::is_branch_type(uint32_t insn) {
+ return Instruction_aarch32::extract(insn, 27, 20) == 0b00010010 &&
+ Instruction_aarch32::extract(insn, 19, 8) == 0b111111111111;
+}
+
+void NativeBranchType::patch_offset_to(address dest) {
+ uint32_t insn = as_uint();
+ const intptr_t off = (dest - (addr() + 8));
+ assert((off & 3) == 0, "should be");
+ assert(-32 * 1024 * 1024 <= off && off < 32 * 1024 * 1024,
+ "new offset should fit in instruction");
+
+ const unsigned off_mask = ((1U << 24) - 1);
+ insn &= ~off_mask; // mask off offset part
+ insn |= ((unsigned) off >> 2) & off_mask;
+
+ set_uint(insn);
+ ICache::invalidate_range(addr_at(0), instruction_size);
+}
+
+//-------------------------------------------------------------------
+
+address NativeImmJump::destination() const {
+ assert(is_imm_jump(), "not jump");
+ return addr() + 8 + 4 * Instruction_aarch32::sextract(as_uint(), 23, 0);
+}
+
+void NativeImmJump::set_destination(address addr) {
+ assert(is_imm_jump(), "");
+ patch_offset_to(addr);
+}
+
+bool NativeImmJump::is_at(address addr) {
+ unsigned insn = as_uint(addr);
+ return Instruction_aarch32::extract(insn, 27, 24) == 0b1010;
+}
+
+NativeImmJump* NativeImmJump::from(address addr) {
+ assert(NativeImmJump::is_at(addr), "");
+ return (NativeImmJump*) addr;
+}
+
+//-------------------------------------------------------------------
+
+bool NativeRegJump::is_at(address addr) {
+ unsigned insn = as_uint(addr);
+ return is_branch_type(insn) && Instruction_aarch32::extract(insn, 7, 4) == 0b0001;
+}
+
+NativeRegJump* NativeRegJump::from(address addr) {
+ assert(NativeRegJump::is_at(addr), "");
+ return (NativeRegJump*) addr;
+}
+
+Register NativeRegJump::destination() const {
+ assert(is_reg_jump(), "");
+ return (Register) Instruction_aarch32::extract(as_uint(), 3, 0);
+}
+
+//-------------------------------------------------------------------
+
+bool NativeInstruction::is_safepoint_poll() {
+#ifdef COMPILER2_OR_JVMCI
+ // it would be too complex to find the place where the poll address is
+ // loaded into the address register, since C2 can do this somewhere else,
+ // so we only check for the exact poll instruction in the form
+ // ldr(r12, [rXXX, #0])
+ return (NativeInstruction::as_uint() & 0xfff0ffff) == 0xe590c000;
+#else
+ // a safepoint_poll is implemented in two steps as
+ //
+ // movw(r12, polling_page & 0xffff);
+ // movt(r12, polling_page >> 16);
+ // ldr(r12, [r12, #0]);
+ //
+ // or, if thread-local handshakes are used
+ //
+ // ldr(r12, [rthread, #offset]);
+ // ldr(r12, [r12, #0]);
+ //
+ // We can rely on this instruction order since we have only C1
+
+ if (SafepointMechanism::uses_thread_local_poll()) {
+ const Register scratch = rscratch2;
+
+ if (NativeInstruction::from(addr())->is_ldr(scratch, Address(scratch))) {
+ return NativeInstruction::from(addr()-arm_insn_sz)
+ ->is_ldr(scratch, Address(rthread, Thread::polling_page_offset()));
+ }
+ } else {
+ const intptr_t paddr = (intptr_t)os::get_polling_page();
+ const Register scratch = rscratch2;
+
+ if (NativeInstruction::from(addr())->is_ldr(scratch, Address(scratch))) {
+ NativeMovConstReg* mov_const = NativeMovConstReg::before(addr());
+ return (mov_const->data() == paddr) && (mov_const->destination() == scratch);
+ }
+ }
+
+ return false;
+#endif
+}
+
+bool NativeInstruction::is_movt(Register dst,
unsigned imm, Assembler::Condition cond) { + bool a1 = Instruction_aarch32::extract(uint_at(0), 27, 20) == 0b00110100; + bool a2 = Instruction_aarch32::extract(uint_at(0), 15, 12) == (unsigned)dst; + bool a3 = Instruction_aarch32::extract(uint_at(0), 11, 0) == ((unsigned)imm & 0xfff); + bool a4 = Instruction_aarch32::extract(uint_at(0), 19, 16) == ((unsigned)imm >> 12); + bool a5 = Instruction_aarch32::extract(uint_at(0), 31, 28) == (unsigned)cond; + + return a1 && a2 && a3 && a4 && a5; +} + +bool NativeInstruction::is_movw(Register dst, unsigned imm, Assembler::Condition cond) { + bool a1 = Instruction_aarch32::extract(uint_at(0), 27, 20) == 0b00110000; + bool a2 = Instruction_aarch32::extract(uint_at(0), 15, 12) == (unsigned)dst; + bool a3 = Instruction_aarch32::extract(uint_at(0), 11, 0) == ((unsigned)imm & 0xfff); + bool a4 = Instruction_aarch32::extract(uint_at(0), 19, 16) == ((unsigned)imm >> 12); + bool a5 = Instruction_aarch32::extract(uint_at(0), 31, 28) == (unsigned)cond; + + return a1 && a2 && a3 && a4 && a5; +} + +bool NativeInstruction::is_ldr(Register dst, Address addr, Assembler::Condition cond) { + assert(addr.get_mode() == Address::imm, "unimplemented"); + assert(addr.get_wb_mode() == Address::off, "unimplemented"); + assert(addr.index() == noreg, "unimplemented"); + assert(addr.offset() == 0, "unimplemented"); + + bool b0 = Instruction_aarch32::extract(uint_at(0), 24, 24) == 1; //P + bool b1 = Instruction_aarch32::extract(uint_at(0), 23, 23) == 1; //U + bool b2 = Instruction_aarch32::extract(uint_at(0), 21, 21) == 0; //W + bool b3 = Instruction_aarch32::extract(uint_at(0), 19, 16) == (unsigned)addr.base(); + bool b4 = Instruction_aarch32::extract(uint_at(0), 11, 0) == 0; + + bool a1 = b0 && b1 && b2 && b3 && b4; //Address encoding + + bool a2 = Instruction_aarch32::extract(uint_at(0), 15, 12) == (unsigned)dst; + bool a3 = Instruction_aarch32::extract(uint_at(0), 20, 20) == 1; + bool a4 = Instruction_aarch32::extract(uint_at(0), 22, 22) == 0; + bool a5 = Instruction_aarch32::extract(uint_at(0), 27, 25) == 0b010; + bool a6 = Instruction_aarch32::extract(uint_at(0), 31, 28) == (unsigned)cond; + + return a1 && a2 && a3 && a4 && a5 && a6; +} + + +bool NativeInstruction::is_movt() { + return Instruction_aarch32::extract(int_at(0), 27, 20) == 0b00110100; +} + +bool NativeInstruction::is_orr() { + return Instruction_aarch32::extract(int_at(0), 27, 21) == 0b0011100; +} + +bool NativeInstruction::is_sigill_zombie_not_entrant() { + return as_uint() == 0xe7fdeafd; // udf #0xdead +} + +void NativeIllegalInstruction::insert(address code_pos) { + *(juint*)code_pos = 0xe7fdeafd; // udf #0xdead +} + +//------------------------------------------------------------------- + +void NativeGeneralJump::verify() { } + +void NativeGeneralJump::insert_unconditional(address code_pos, address entry) { + NativeGeneralJump* n_jump = (NativeGeneralJump*)code_pos; + assert(n_jump->is_nop() || n_jump->is_imm_jump(), "not overwrite whats not supposed"); + + CodeBuffer cb(code_pos, instruction_size); + MacroAssembler a(&cb); + + a.b(entry); + + ICache::invalidate_range(code_pos, instruction_size); +} + +// MT-safe patching of a long jump instruction. 
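+// Hedged sketch of the images replace_mt_safe() relies on, as implied by its
+// asserts (the patching prolog itself is emitted elsewhere in this port):
+//   code_buffer : a nop followed by the remainder of the far-ldr sequence;
+//   instr_addr  : an immediate 'b' that skips the patching prolog, with the
+//                 far-ldr load located at instr_addr + 2*arm_insn_sz.
+// A typical call from shared code would be
+//   NativeGeneralJump::replace_mt_safe(jump_site, stub_copy);
+// where 'jump_site' and 'stub_copy' are illustrative names only. The far-ldr
+// data address is propagated first, a write barrier is issued, and only then
+// is the single word at instr_addr swapped, so concurrently running threads
+// see either the old branch or the fully initialized sequence.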
+void NativeGeneralJump::replace_mt_safe(address instr_addr, address code_buffer) { + if (NativeFarLdr::is_at(instr_addr+2*arm_insn_sz)) { + assert(NativeInstruction::from(code_buffer)->is_nop(), "code_buffer image"); + assert(NativeImmJump::is_at(instr_addr), "instr_image image"); + // first 'b' prevents NativeFarLdr to recognize patching_prolog, skip it manually + address load_instr = instr_addr+2*arm_insn_sz; + + NativeFarLdr::from(load_instr)->set_data_addr(NativeFarLdr::from(code_buffer)->data_addr()); + + WRITE_MEM_BARRIER; + *(uintptr_t*)instr_addr = *(uintptr_t*)code_buffer; + ICache::invalidate_word(instr_addr); + + assert(NativeFarLdr::is_at(instr_addr), "now valid constant loading"); + } else { + ShouldNotReachHere(); + } +} --- /dev/null 2018-09-25 19:25:16.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/nativeInst_aarch32.hpp 2018-09-25 19:25:16.000000000 +0300 @@ -0,0 +1,542 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_NATIVEINST_AARCH32_HPP +#define CPU_AARCH32_VM_NATIVEINST_AARCH32_HPP + +#include "asm/assembler.hpp" +#include "runtime/icache.hpp" +#include "runtime/os.hpp" + +// We have interfaces for the following instructions: +// - NativeInstruction +// - - NativeCall +// - - NativeMovConstReg +// - - NativeMovRegMem +// - - NativeMovRegMemPatching +// - - NativeJump +// - - NativeIllegalOpCode +// - - NativeGeneralJump +// - - NativeReturn +// - - NativeReturnX (return with argument) +// - - NativePushConst +// - - NativeTstRegMem + +// The base class for different kinds of native instruction abstractions. +// Provides the primitive operations to manipulate code relative to this. 
+ +class NativeInstruction { + friend class Relocation; + friend bool is_NativeCallTrampolineStub_at(address); + public: + enum { arm_insn_sz = 4 }; + + inline bool is_nop(); + inline bool is_barrer(); + inline bool is_illegal(); + inline bool is_return(); + inline bool is_jump_or_nop(); + inline bool is_cond_jump(); + bool is_safepoint_poll(); + bool is_movt(); + bool is_orr(); + bool is_sigill_zombie_not_entrant(); + + bool is_movt(Register dst, unsigned imm, Assembler::Condition cond = Assembler::C_DFLT); + bool is_movw(Register dst, unsigned imm, Assembler::Condition cond = Assembler::C_DFLT); + bool is_ldr(Register dst, Address addr, Assembler::Condition cond = Assembler::C_DFLT); + + inline bool is_jump() const; + inline bool is_call() const; + + inline bool is_mov_const_reg() const; + inline bool is_reg_call() const; + inline bool is_imm_call() const; + inline bool is_reg_jump() const; + inline bool is_imm_jump() const; + + protected: + address addr() const { return address(this); } + // TODO remove this, every command is 4byte long + + address addr_at(int offset) const { return addr() + offset; } + + s_char sbyte_at(int offset) const { return *(s_char*) addr_at(offset); } + u_char ubyte_at(int offset) const { return *(u_char*) addr_at(offset); } + + jint int_at(int offset) const { return *(jint*) addr_at(offset); } + juint uint_at(int offset) const { return *(juint*) addr_at(offset); } + + address ptr_at(int offset) const { return *(address*) addr_at(offset); } + + oop oop_at (int offset) const { return *(oop*) addr_at(offset); } + + + void set_char_at(int offset, char c) { *addr_at(offset) = (u_char)c; } + void set_int_at(int offset, jint i) { *(jint*)addr_at(offset) = i; } + void set_uint_at(int offset, jint i) { *(juint*)addr_at(offset) = i; } + void set_ptr_at (int offset, address ptr) { *(address*) addr_at(offset) = ptr; } + void set_oop_at (int offset, oop o) { *(oop*) addr_at(offset) = o; } + + static juint as_uint(address addr) { + return *(juint *) addr; + } + + juint as_uint() const { + return as_uint(addr()); + } + + void set_uint(juint v) { + *(juint *) addr() = v; + } + + public: + + // unit test stuff + static void test() {} // override for testing + + static bool is_at(address address); + static NativeInstruction* from(address address); + +}; + +inline NativeInstruction* nativeInstruction_at(address addr) { + return NativeInstruction::from(addr); +} + +inline NativeInstruction* nativeInstruction_at(uint32_t *addr) { + return NativeInstruction::from(address(addr)); +} + +class NativeBranchType: public NativeInstruction { + protected: + static bool is_branch_type(uint32_t insn); + void patch_offset_to(address addr); + public: + enum { + instruction_size = arm_insn_sz, + }; + + address next_instruction_address() const { + return addr() + arm_insn_sz; + } +}; + +class NativeFarLdr: public NativeInstruction { + private: + static address skip_patching_prolog(address addr); + public: + static bool is_at(address addr); + static NativeFarLdr* from(address addr); + intptr_t *data_addr(); + void set_data_addr(intptr_t *data_addr); + address next_instruction_address() const; +}; + +class NativeMovConstReg: public NativeInstruction { + friend class Relocation; + friend class NativeMovRegMem; + friend class NativeGeneralJump; + friend class NativeFarLdr; + + protected: + static bool is_ldr_literal_at(address instr, Register from = r15_pc); + static bool is_far_ldr_literal_at(address instr); + static bool is_movw_movt_at(address instr); + static bool is_mov_n_three_orr_at(address 
instr); + public: + enum { + ldr_sz = 1 * arm_insn_sz, + far_ldr_sz = 2 * arm_insn_sz, + movw_movt_pair_sz = 2 * arm_insn_sz, + mov_n_three_orr_sz = 4 * arm_insn_sz, + min_instruction_size = 1 * arm_insn_sz, + max_instruction_size = 4 * arm_insn_sz, + }; + + address next_instruction_address() const { + if (is_ldr_literal_at(addr())) { + return addr() + ldr_sz; + } else if (is_far_ldr_literal_at(addr())) { + return NativeFarLdr::from(addr())->next_instruction_address();; + } else if (is_movw_movt_at(addr())) { + return addr() + movw_movt_pair_sz; + } else if (is_mov_n_three_orr_at(addr())) { + return addr() + mov_n_three_orr_sz; + } + + // Unknown instruction in NativeMovConstReg + ShouldNotReachHere(); + return NULL; + } + + intptr_t data() const; + void set_data(intptr_t x); + + Register destination() const; + void set_destination(Register r); + + void flush() { + ICache::invalidate_range(addr(), max_instruction_size); + } + + void verify(); + void print(); + + // unit test stuff + static void test() {} + + // Creation + inline friend NativeMovConstReg* nativeMovConstReg_at(address address); + + static NativeMovConstReg* before(address addr) { + address mov = NULL; + if (is_ldr_literal_at(addr - ldr_sz)) { + mov = addr - ldr_sz; + } else if (is_far_ldr_literal_at(addr - far_ldr_sz)) { + mov = addr - far_ldr_sz; + } else if (is_movw_movt_at(addr - movw_movt_pair_sz)) { + mov = addr - movw_movt_pair_sz; + } else if (is_mov_n_three_orr_at(addr - mov_n_three_orr_sz)) { + mov = addr - mov_n_three_orr_sz; + } + guarantee(mov, "Can't find NativeMovConstReg before"); + return NativeMovConstReg::from(mov); + } + + static bool is_at(address instr); + static NativeMovConstReg* from(address addr); +}; + +inline NativeMovConstReg* nativeMovConstReg_at(address address) { + return NativeMovConstReg::from(address); +} + +class NativeTrampolineCall: public NativeInstruction { + public: + // NativeTrampolineCall size is always equal to NativeCall::instruction_size + address destination() const; + void set_destination(address dest); + void set_destination_mt_safe(address dest, bool assert_lock = true); + + static bool is_at(address address); + static NativeTrampolineCall* from(address address); + + address next_instruction_address() const; +}; + +class NativeRegCall: public NativeBranchType { + public: + + Register destination() const; + void set_destination(Register r); + + static bool is_at(address address); + static NativeRegCall* from(address address); +}; + +class NativeCall: public NativeInstruction { + friend class Relocation; + protected: + NativeInstruction* is_long_jump_or_call_at(address addr); + + // NativeCall represents: + // NativeImmCall, + // NativeMovConstReg + NativeBranchType, + // NativeTrampolineCall + public: + enum { + max_instruction_size = 5 * arm_insn_sz + }; + + static int instruction_size; +#ifdef ASSERT + STATIC_ASSERT(NativeMovConstReg::movw_movt_pair_sz + + NativeRegCall::instruction_size <= (int) max_instruction_size); + STATIC_ASSERT(NativeMovConstReg::mov_n_three_orr_sz + + NativeRegCall::instruction_size <= (int) max_instruction_size); +#endif + + address destination() const; + void set_destination(address dest); + + static void init(); + void verify_alignment() { ; } + void verify(); + void print(); + + address instruction_address() const { return addr_at(0); } + address next_instruction_address() const; + address return_address() const; + + // MT-safe patching of a call instruction. 
+ static void insert(address code_pos, address entry); + + // Similar to replace_mt_safe, but just changes the destination. The + // important thing is that free-running threads are able to execute + // this call instruction at all times. If the call is an immediate BL + // instruction we can simply rely on atomicity of 32-bit writes to + // make sure other threads will see no intermediate states. + + // We cannot rely on locks here, since the free-running threads must run at + // full speed. + // + // Used in the runtime linkage of calls; see class CompiledIC. + // (Cf. 4506997 and 4479829, where threads witnessed garbage displacements.) + + // The parameter assert_lock disables the assertion during code generation. + void set_destination_mt_safe(address dest, bool assert_lock = true); + + static bool is_at(address instr); + static NativeCall* from(address instr); + + static bool is_call_before(address return_address); +}; + +inline address NativeTrampolineCall::next_instruction_address() const { + assert(is_at(addr()), "not call"); + return addr() + NativeCall::instruction_size; +} + +inline NativeCall* nativeCall_at(address address) { + return NativeCall::from(address); +} + +// An interface for accessing/manipulating native moves of the form: +// mov[b/w/l/q] [reg + offset], reg (instruction_code_reg2mem) +// mov[b/w/l/q] reg, [reg+offset] (instruction_code_mem2reg +// mov[s/z]x[w/b/q] [reg + offset], reg +// fld_s [reg+offset] +// fld_d [reg+offset] +// fstp_s [reg + offset] +// fstp_d [reg + offset] +// mov_literal64 scratch, ; mov[b/w/l/q] 0(scratch),reg | mov[b/w/l/q] reg,0(scratch) +// +// Warning: These routines must be able to handle any instruction sequences +// that are generated as a result of the load/store byte,word,long +// macros. For example: The load_unsigned_byte instruction generates +// an xor reg,reg inst prior to generating the movb instruction. This +// class must skip the xor instruction. 
+ + +// TODO Review +class NativeMovRegMem: public NativeInstruction { + public: + enum { + instruction_size = 2 * arm_insn_sz, // TODO check this + }; + // helper + int instruction_start() const; + + address instruction_address() const; + + address next_instruction_address() const; + + int offset() const; + + void set_offset(int x); + + void add_offset_in_bytes(int add_offset) { set_offset ( ( offset() + add_offset ) ); } + + void verify(); + void print (); + + // unit test stuff + static void test() {} + + private: + inline friend NativeMovRegMem* nativeMovRegMem_at (address address); +}; + +inline NativeMovRegMem* nativeMovRegMem_at (address address) { + NativeMovRegMem* test = (NativeMovRegMem*) address; +#ifdef ASSERT + test->verify(); +#endif + return test; +} + +class NativeMovRegMemPatching: public NativeMovRegMem { + private: + friend NativeMovRegMemPatching* nativeMovRegMemPatching_at (address address) {Unimplemented(); return 0; } +}; + +class NativeJump: public NativeInstruction { + public: + enum { + instruction_size = NativeMovConstReg::movw_movt_pair_sz + NativeBranchType::instruction_size, + }; + address instruction_address() const { + return addr(); + } + + address next_instruction_address() const; + + address jump_destination() const; + void set_jump_destination(address dest); + + // Creation + inline friend NativeJump* nativeJump_at(address address); + + void verify(); + + // Unit testing stuff + static void test() {} + + // Insertion of native jump instruction + static void insert(address code_pos, address entry); + // MT-safe insertion of native jump at verified method entry + static void check_verified_entry_alignment(address entry, address verified_entry); + static void patch_verified_entry(address entry, address verified_entry, address dest); + + static bool is_at(address instr); + static NativeJump* from(address instr); +}; + +inline NativeJump* nativeJump_at(address addr) { + return NativeJump::from(addr); +} + +// TODO We don't really need NativeGeneralJump, NativeJump should be able to do +// everything that General Jump would. 
Make this only interface to NativeJump +// from share code (c1_Runtime) +class NativeGeneralJump: public NativeJump { +public: + enum { + instruction_size = arm_insn_sz, + }; + + static void insert_unconditional(address code_pos, address entry); + static void replace_mt_safe(address instr_addr, address code_buffer); + static void verify(); +}; + +inline NativeGeneralJump* nativeGeneralJump_at(address address) { + NativeGeneralJump* jump = (NativeGeneralJump*)(address); + debug_only(jump->verify();) + return jump; +} + +class NativePopReg : public NativeInstruction { + public: + // Insert a pop instruction + static void insert(address code_pos, Register reg); +}; + + +class NativeIllegalInstruction: public NativeInstruction { + public: + // Insert illegal opcode as specific address + static void insert(address code_pos); +}; + +// return instruction that does not pop values of the stack +class NativeReturn: public NativeInstruction { + public: +}; + +// return instruction that does pop values of the stack +class NativeReturnX: public NativeInstruction { + public: +}; + +// Simple test vs memory +class NativeTstRegMem: public NativeInstruction { + public: +}; + +inline bool NativeInstruction::is_nop() { + return (as_uint() & 0x0fffffff) == 0x0320f000; +} + +inline bool NativeInstruction::is_barrer() { + return (as_uint() == 0xf57ff05b /* dmb ish */ || + as_uint() == 0xee070fba /* mcr 15, 0, r0, cr7, cr10, {5}) */); +} + +inline bool NativeInstruction::is_jump_or_nop() { + return is_nop() || is_jump(); +} + +class NativeImmCall: public NativeBranchType { + public: + address destination() const; + void set_destination(address dest); + + static bool is_at(address address); + static NativeImmCall* from(address address); +}; + +class NativeImmJump: public NativeBranchType { + public: + + address destination() const; + void set_destination(address r); + + static bool is_at(address address); + static NativeImmJump* from(address address); +}; + +class NativeRegJump: public NativeBranchType { + public: + + Register destination() const; + void set_destination(Register r); + + static bool is_at(address address); + static NativeRegJump* from(address address); +}; + +inline bool NativeInstruction::is_call() const { return NativeCall::is_at(addr()); } +inline bool NativeInstruction::is_jump() const { return NativeJump::is_at(addr()); } +inline bool NativeInstruction::is_mov_const_reg() const { return NativeMovConstReg::is_at(addr()); } +inline bool NativeInstruction::is_imm_call() const { return NativeImmCall::is_at(addr()); } +inline bool NativeInstruction::is_reg_call() const { return NativeRegCall::is_at(addr()); } +inline bool NativeInstruction::is_imm_jump() const { return NativeImmJump::is_at(addr()); } +inline bool NativeInstruction::is_reg_jump() const { return NativeRegJump::is_at(addr()); } + +inline NativeCall* nativeCall_before(address return_address) { + if (NativeTrampolineCall::is_at(return_address - NativeCall::instruction_size)) { + return NativeCall::from(return_address - NativeCall::instruction_size); + } + if (NativeMovConstReg::is_at(return_address - NativeCall::instruction_size)) { + NativeMovConstReg *nm = NativeMovConstReg::from(return_address - NativeCall::instruction_size); + address next_instr = nm->next_instruction_address(); + if (NativeRegCall::is_at(next_instr) && + NativeRegCall::from(next_instr)->destination() == nm->destination()) { + return NativeCall::from(return_address - NativeCall::instruction_size); + } + } + if (NativeImmCall::is_at(return_address - 
NativeBranchType::instruction_size)) { + return NativeCall::from(return_address - NativeBranchType::instruction_size); + } + + ShouldNotReachHere(); + return NULL; +} + +#endif // CPU_AARCH32_VM_NATIVEINST_AARCH32_HPP --- /dev/null 2018-09-25 19:25:17.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/registerMap_aarch32.hpp 2018-09-25 19:25:17.000000000 +0300 @@ -0,0 +1,44 @@ +/* + * Copyright (c) 1998, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_REGISTERMAP_AARCH32_HPP +#define CPU_AARCH32_VM_REGISTERMAP_AARCH32_HPP + + private: + // This is a hook for finding a register in a "well-known" location, such as + // a register block of a predetermined format. Since there is none, we just + // return NULL. See registerMap_sparc.hpp for an example of grabbing + // registers from register save areas of a standard layout. + address pd_location(VMReg reg) const { + return NULL; + } + + // No platform dependent state to clear, initialize, or copy + void pd_clear() {} + void pd_initialize() {} + void pd_initialize_from(const RegisterMap* map) {} + +#endif // CPU_AARCH32_VM_REGISTERMAP_AARCH32_HPP --- /dev/null 2018-09-25 19:25:18.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/register_aarch32.cpp 2018-09-25 19:25:18.000000000 +0300 @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/register.hpp" + +const int ConcreteRegisterImpl::max_gpr = RegisterImpl::number_of_registers; +const int ConcreteRegisterImpl::max_fpr = ConcreteRegisterImpl::max_gpr + + FloatRegisterImpl::number_of_registers; + +const char* RegisterImpl::name() const { + const char* names[number_of_registers] = { + "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", + "r8", "r9", "r10", "r11", "r12", "r13", "r14", "r15" + }; + return is_valid() ? names[encoding()] : "noreg"; +} + +const char* FloatRegisterImpl::name() const { + const char* names[number_of_registers] = { + "s0", "s1", "s2", "s3", "s4", "s5", "s6", "s7", + "s8", "s9", "s10", "s11", "s12", "s13", "s14", "s15", + "s16", "s17", "s18", "s19", "s20", "s21", "s22", "s23", + "s24", "s25", "s26", "s27", "s28", "s29", "s30", "s31" + }; + return is_valid() ? names[encoding()] : "fnoreg"; +} --- /dev/null 2018-09-25 19:25:19.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/register_aarch32.hpp 2018-09-25 19:25:19.000000000 +0300 @@ -0,0 +1,483 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_REGISTER_AARCH32_HPP +#define CPU_AARCH32_VM_REGISTER_AARCH32_HPP + +#include "asm/register.hpp" + +class VMRegImpl; +typedef VMRegImpl* VMReg; + +// Implementation of integer registers for AArch32 architecture + +class RegisterImpl; +typedef RegisterImpl* Register; + +inline Register as_Register(int encoding) { + return (Register)(intptr_t) encoding; +} + +class RegisterImpl : public AbstractRegisterImpl { + public: + enum { + number_of_registers = 16 + }; + + // Construction + inline friend Register as_Register(int encoding); + + // Accessors + int encoding() const { + assert(is_valid(), "invalid register"); + return (intptr_t) this; + } + int encoding_nocheck() const { + return (intptr_t) this; + } + VMReg as_VMReg(); + Register successor() const { + return as_Register(encoding() + 1); + } + + // Testers + bool is_valid() const { + return 0 <= (intptr_t) this && (intptr_t) this < number_of_registers; + } + + // Return the bit which represents this register. This is intended to be + // used in bitmasks. 
See RegSet class below. + unsigned long bit(bool should_set = true) const { + return should_set ? 1 << encoding() : 0; + } + + // Return the name of this register + const char* name() const; +}; + +// Integer registers of AArch32 architecture +#define R(r) ((Register)(r)) + +#define Rmh_SP_save r11 // for C2 + +CONSTANT_REGISTER_DECLARATION(Register, noreg, -1); + +CONSTANT_REGISTER_DECLARATION(Register, r0, 0); +CONSTANT_REGISTER_DECLARATION(Register, r1, 1); +CONSTANT_REGISTER_DECLARATION(Register, r2, 2); +CONSTANT_REGISTER_DECLARATION(Register, r3, 3); +CONSTANT_REGISTER_DECLARATION(Register, r4, 4); +CONSTANT_REGISTER_DECLARATION(Register, r5, 5); +CONSTANT_REGISTER_DECLARATION(Register, r6, 6); +CONSTANT_REGISTER_DECLARATION(Register, r7, 7); +CONSTANT_REGISTER_DECLARATION(Register, r8, 8); +CONSTANT_REGISTER_DECLARATION(Register, r9, 9); +CONSTANT_REGISTER_DECLARATION(Register, r10, 10); +CONSTANT_REGISTER_DECLARATION(Register, r11, 11); +CONSTANT_REGISTER_DECLARATION(Register, r12, 12); +CONSTANT_REGISTER_DECLARATION(Register, r13, 13); +CONSTANT_REGISTER_DECLARATION(Register, r14, 14); +CONSTANT_REGISTER_DECLARATION(Register, r15, 15); + +// Implementation of floating point registers for AArch32 (VFPv3-D16) +// architecture + +class FloatRegisterImpl; +typedef FloatRegisterImpl* FloatRegister; + +// Return FloatRegister corresponding to the given s-type (aka f-type in this +// port) register number +inline FloatRegister as_FloatRegister(int encoding) { + return (FloatRegister)(intptr_t) encoding; +} + +// Return FloatRegister corresponding to the given d-type register number +inline FloatRegister as_DoubleFloatRegister(int encoding) { + return as_FloatRegister(2 * encoding); +} + +class FloatRegisterImpl : public AbstractRegisterImpl { + public: + enum { + // VFPv3-D16 architecture includes 16 doubleword registers, which can be + // also observed as 32 singleword registers. We count the singleword + // registers here. + number_of_registers = 32 + }; + + enum FloatRegisterSize { + SINGLE = 1, + DOUBLE = 2, + QUAD = 4 + }; + + // Construction + inline friend FloatRegister as_FloatRegister(int encoding); + inline friend FloatRegister as_DoubleFloatRegister(int encoding); + + // Accessors + int encoding() const { + assert(is_valid(), "invalid register"); + return (intptr_t) this; + } + int encoding_nocheck() const { + return (intptr_t) this; + } + VMReg as_VMReg(); + FloatRegister successor(enum FloatRegisterSize size) const { + return (as_FloatRegister((encoding() + (int)size) % number_of_registers | + (encoding() + (int)size) / number_of_registers)); + } + + // Testers + bool is_valid() const { + return 0 <= (intptr_t) this && (intptr_t) this < number_of_registers; + } + + // Return the bit which represents this register. This is intended to be + // used in bitmasks. See FloatRegSet class below. + unsigned long bit(bool should_set = true) const { + return should_set ? 1 << encoding() : 0; + } + + // Return the name of this register + const char* name() const; +}; + +// Floating point registers of AArch32 (VFPv3-D16, D32 and SIMD) architecture + +// Only the first 8 doubleword registers can be used for parameter passing +// and thus are caller-saved. The rest 8 registers are callee-saved. +// In VFPv3-D32 there are additional 16 doubleword registers that are +// caller-saved again. + +// Here we introduce the symbolic names for doubleword registers and the +// corresponding singleword views for the first 16 of them. 
The instruction +// set allows us to encode the doubleword register numbers directly using +// the constants below. + +// The respective names are as well defined for quad-word registers with +// encoding set by the same principles. + +CONSTANT_REGISTER_DECLARATION(FloatRegister, fnoreg, -1); + +CONSTANT_REGISTER_DECLARATION(FloatRegister, d0, 0); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d1, 2); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d2, 4); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d3, 6); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d4, 8); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d5, 10); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d6, 12); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d7, 14); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d8, 16); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d9, 18); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d10, 20); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d11, 22); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d12, 24); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d13, 26); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d14, 28); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d15, 30); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d16, 1); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d17, 3); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d18, 5); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d19, 7); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d20, 9); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d21, 11); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d22, 13); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d23, 15); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d24, 17); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d25, 19); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d26, 21); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d27, 23); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d28, 25); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d29, 27); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d30, 29); +CONSTANT_REGISTER_DECLARATION(FloatRegister, d31, 31); + +CONSTANT_REGISTER_DECLARATION(FloatRegister, q0, 0); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q1, 4); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q2, 8); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q3, 12); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q4, 16); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q5, 20); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q6, 24); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q7, 28); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q8, 1); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q9, 5); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q10, 9); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q11, 13); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q12, 17); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q13, 21); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q14, 25); +CONSTANT_REGISTER_DECLARATION(FloatRegister, q15, 29); + +CONSTANT_REGISTER_DECLARATION(FloatRegister, f0, 0); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f1, 1); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f2, 2); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f3, 3); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f4, 4); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f5, 5); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f6, 6); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f7, 7); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f8, 8); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f9, 9); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f10, 10); 
+CONSTANT_REGISTER_DECLARATION(FloatRegister, f11, 11); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f12, 12); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f13, 13); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f14, 14); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f15, 15); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f16, 16); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f17, 17); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f18, 18); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f19, 19); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f20, 20); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f21, 21); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f22, 22); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f23, 23); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f24, 24); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f25, 25); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f26, 26); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f27, 27); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f28, 28); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f29, 29); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f30, 30); +CONSTANT_REGISTER_DECLARATION(FloatRegister, f31, 31); + +// Set of singleword floating point registers + +class FloatRegSet { + private: + uint32_t _bitset; + + FloatRegSet(uint32_t bitset) : _bitset(bitset) { } + + public: + FloatRegSet() : _bitset(0) { } + + FloatRegSet(FloatRegister r1) : _bitset(r1->bit()) { } + + FloatRegSet operator+(const FloatRegSet aSet) const { + FloatRegSet result(_bitset | aSet._bitset); + return result; + } + + FloatRegSet operator-(const FloatRegSet aSet) const { + FloatRegSet result(_bitset & ~aSet._bitset); + return result; + } + + FloatRegSet& operator+=(const FloatRegSet aSet) { + *this = *this + aSet; + return *this; + } + + static FloatRegSet of(FloatRegister r1) { + return FloatRegSet(r1); + } + + static FloatRegSet of(FloatRegister r1, FloatRegister r2) { + return of(r1) + r2; + } + + static FloatRegSet of(FloatRegister r1, FloatRegister r2, FloatRegister r3) { + return of(r1, r2) + r3; + } + + static FloatRegSet of(FloatRegister r1, FloatRegister r2, FloatRegister r3, + FloatRegister r4) { + return of(r1, r2, r3) + r4; + } + + static FloatRegSet range(FloatRegister start, FloatRegister end) { + uint32_t bits = ~0; + bits <<= start->encoding(); + bits <<= 31 - end->encoding(); + bits >>= 31 - end->encoding(); + return FloatRegSet(bits); + } + + uint32_t bits() const { + return _bitset; + } +}; + +// Set of doubleword floating point registers + +class DoubleFloatRegSet { + private: + uint32_t _bitset; + + DoubleFloatRegSet(uint32_t bitset) : _bitset(bitset) { } + + public: + DoubleFloatRegSet() : _bitset(0) { } + + DoubleFloatRegSet(FloatRegister r1) : _bitset(1 << (r1->encoding()>>1)+((r1->encoding()%2)?16:0)) { } + + DoubleFloatRegSet operator+(const DoubleFloatRegSet aSet) const { + DoubleFloatRegSet result(_bitset | aSet._bitset); + return result; + } + + DoubleFloatRegSet operator-(const DoubleFloatRegSet aSet) const { + DoubleFloatRegSet result(_bitset & ~aSet._bitset); + return result; + } + + DoubleFloatRegSet& operator+=(const DoubleFloatRegSet aSet) { + *this = *this + aSet; + return *this; + } + + static DoubleFloatRegSet of(FloatRegister r1) { + return DoubleFloatRegSet(r1); + } + + static DoubleFloatRegSet of(FloatRegister r1, FloatRegister r2) { + return of(r1) + r2; + } + + static DoubleFloatRegSet of(FloatRegister r1, FloatRegister r2, + FloatRegister r3) { + return of(r1, r2) + r3; + } + + static DoubleFloatRegSet of(FloatRegister r1, 
FloatRegister r2, + FloatRegister r3, FloatRegister r4) { + return of(r1, r2, r3) + r4; + } + + static DoubleFloatRegSet range(FloatRegister start, FloatRegister end) { + int start_reg = (start->encoding() >> 1)+((start->encoding()%2)?16:0); + int end_reg = (end->encoding() >> 1)+((end->encoding()%2)?16:0); + uint32_t bits = ~0; + bits <<= start_reg; + bits <<= 31 - end_reg; + bits >>= 31 - end_reg; + return DoubleFloatRegSet(bits); + } + + uint32_t bits() const { + return _bitset; + } +}; + +// Total number of registers of all sorts + +class ConcreteRegisterImpl : public AbstractRegisterImpl { + public: + enum { + // Here we count the total number of 32-bit slots available in registers. + // This number must be large enough to cover REG_COUNT (defined by C2) + // registers. There is no requirement that any ordering here matches + // any ordering C2 gives its OptoReg's. + // C2 port is made to be able to operate on all 32 double registers of VFPD32 + // but the register count is 32 since high half of 32-bit regs are not addressable + // so need to double the amount of known registers to get expected 64 +#ifndef COMPILER2 + number_of_registers = RegisterImpl::number_of_registers + + FloatRegisterImpl::number_of_registers +#else + number_of_registers = RegisterImpl::number_of_registers + + (FloatRegisterImpl::number_of_registers*2) + + 2 +#endif + }; + + static const int max_gpr; + static const int max_fpr; +}; + + +// Set of integer registers + +class RegSet { + private: + uint32_t _bitset; + + RegSet(uint32_t bitset) : _bitset(bitset) { } + + public: + RegSet() : _bitset(0) { } + + RegSet(Register r1) : _bitset(r1->bit()) { } + + RegSet operator+(const RegSet aSet) const { + RegSet result(_bitset | aSet._bitset); + return result; + } + + RegSet operator-(const RegSet aSet) const { + RegSet result(_bitset & ~aSet._bitset); + return result; + } + + RegSet& operator+=(const RegSet aSet) { + *this = *this + aSet; + return *this; + } + + static RegSet of(Register r1) { + return RegSet(r1); + } + + static RegSet of(Register r1, Register r2) { + return of(r1) + r2; + } + + static RegSet of(Register r1, Register r2, Register r3) { + return of(r1, r2) + r3; + } + + static RegSet of(Register r1, Register r2, Register r3, Register r4) { + return of(r1, r2, r3) + r4; + } + + static RegSet of(Register r1, Register r2, Register r3, Register r4, Register r5) { + return of(r1, r2, r3, r4) + r5; + } + + static RegSet of(Register r1, Register r2, Register r3, Register r4, Register r5, Register r6) { + return of(r1, r2, r3, r4, r5) + r6; + } + + static RegSet range(Register start, Register end) { + uint32_t bits = ~0; + bits <<= start->encoding(); + bits <<= 31 - end->encoding(); + bits >>= 31 - end->encoding(); + return RegSet(bits); + } + + uint32_t bits() const { + return _bitset; + } +}; + + +#endif // CPU_AARCH32_VM_REGISTER_AARCH32_HPP --- /dev/null 2018-09-25 19:25:20.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/register_definitions_aarch32.cpp 2018-09-25 19:25:20.000000000 +0300 @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2002, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/assembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "asm/register.hpp" + +REGISTER_DEFINITION(Register, noreg); + +REGISTER_DEFINITION(Register, r0); +REGISTER_DEFINITION(Register, r1); +REGISTER_DEFINITION(Register, r2); +REGISTER_DEFINITION(Register, r3); +REGISTER_DEFINITION(Register, r4); +REGISTER_DEFINITION(Register, r5); +REGISTER_DEFINITION(Register, r6); +REGISTER_DEFINITION(Register, r7); +REGISTER_DEFINITION(Register, r8); +REGISTER_DEFINITION(Register, r9); +REGISTER_DEFINITION(Register, r10); +REGISTER_DEFINITION(Register, r11); +REGISTER_DEFINITION(Register, r12); +REGISTER_DEFINITION(Register, r13); +REGISTER_DEFINITION(Register, r14); +REGISTER_DEFINITION(Register, r15); + +REGISTER_DEFINITION(FloatRegister, fnoreg); + +REGISTER_DEFINITION(FloatRegister, d0); +REGISTER_DEFINITION(FloatRegister, d1); +REGISTER_DEFINITION(FloatRegister, d2); +REGISTER_DEFINITION(FloatRegister, d3); +REGISTER_DEFINITION(FloatRegister, d4); +REGISTER_DEFINITION(FloatRegister, d5); +REGISTER_DEFINITION(FloatRegister, d6); +REGISTER_DEFINITION(FloatRegister, d7); +REGISTER_DEFINITION(FloatRegister, d8); +REGISTER_DEFINITION(FloatRegister, d9); +REGISTER_DEFINITION(FloatRegister, d10); +REGISTER_DEFINITION(FloatRegister, d11); +REGISTER_DEFINITION(FloatRegister, d12); +REGISTER_DEFINITION(FloatRegister, d13); +REGISTER_DEFINITION(FloatRegister, d14); +REGISTER_DEFINITION(FloatRegister, d15); +REGISTER_DEFINITION(FloatRegister, d16); +REGISTER_DEFINITION(FloatRegister, d17); +REGISTER_DEFINITION(FloatRegister, d18); +REGISTER_DEFINITION(FloatRegister, d19); +REGISTER_DEFINITION(FloatRegister, d20); +REGISTER_DEFINITION(FloatRegister, d21); +REGISTER_DEFINITION(FloatRegister, d22); +REGISTER_DEFINITION(FloatRegister, d23); +REGISTER_DEFINITION(FloatRegister, d24); +REGISTER_DEFINITION(FloatRegister, d25); +REGISTER_DEFINITION(FloatRegister, d26); +REGISTER_DEFINITION(FloatRegister, d27); +REGISTER_DEFINITION(FloatRegister, d28); +REGISTER_DEFINITION(FloatRegister, d29); +REGISTER_DEFINITION(FloatRegister, d30); +REGISTER_DEFINITION(FloatRegister, d31); + +REGISTER_DEFINITION(FloatRegister, q0); +REGISTER_DEFINITION(FloatRegister, q1); +REGISTER_DEFINITION(FloatRegister, q2); +REGISTER_DEFINITION(FloatRegister, q3); +REGISTER_DEFINITION(FloatRegister, q4); +REGISTER_DEFINITION(FloatRegister, q5); +REGISTER_DEFINITION(FloatRegister, q6); +REGISTER_DEFINITION(FloatRegister, q7); +REGISTER_DEFINITION(FloatRegister, q8); +REGISTER_DEFINITION(FloatRegister, q9); +REGISTER_DEFINITION(FloatRegister, q10); 
+REGISTER_DEFINITION(FloatRegister, q11); +REGISTER_DEFINITION(FloatRegister, q12); +REGISTER_DEFINITION(FloatRegister, q13); +REGISTER_DEFINITION(FloatRegister, q14); +REGISTER_DEFINITION(FloatRegister, q15); + +REGISTER_DEFINITION(FloatRegister, f0); +REGISTER_DEFINITION(FloatRegister, f1); +REGISTER_DEFINITION(FloatRegister, f2); +REGISTER_DEFINITION(FloatRegister, f3); +REGISTER_DEFINITION(FloatRegister, f4); +REGISTER_DEFINITION(FloatRegister, f5); +REGISTER_DEFINITION(FloatRegister, f6); +REGISTER_DEFINITION(FloatRegister, f7); +REGISTER_DEFINITION(FloatRegister, f8); +REGISTER_DEFINITION(FloatRegister, f9); +REGISTER_DEFINITION(FloatRegister, f10); +REGISTER_DEFINITION(FloatRegister, f11); +REGISTER_DEFINITION(FloatRegister, f12); +REGISTER_DEFINITION(FloatRegister, f13); +REGISTER_DEFINITION(FloatRegister, f14); +REGISTER_DEFINITION(FloatRegister, f15); +REGISTER_DEFINITION(FloatRegister, f16); +REGISTER_DEFINITION(FloatRegister, f17); +REGISTER_DEFINITION(FloatRegister, f18); +REGISTER_DEFINITION(FloatRegister, f19); +REGISTER_DEFINITION(FloatRegister, f20); +REGISTER_DEFINITION(FloatRegister, f21); +REGISTER_DEFINITION(FloatRegister, f22); +REGISTER_DEFINITION(FloatRegister, f23); +REGISTER_DEFINITION(FloatRegister, f24); +REGISTER_DEFINITION(FloatRegister, f25); +REGISTER_DEFINITION(FloatRegister, f26); +REGISTER_DEFINITION(FloatRegister, f27); +REGISTER_DEFINITION(FloatRegister, f28); +REGISTER_DEFINITION(FloatRegister, f29); +REGISTER_DEFINITION(FloatRegister, f30); +REGISTER_DEFINITION(FloatRegister, f31); + + +REGISTER_DEFINITION(Register, c_rarg0); +REGISTER_DEFINITION(Register, c_rarg1); +REGISTER_DEFINITION(Register, c_rarg2); +REGISTER_DEFINITION(Register, c_rarg3); + +REGISTER_DEFINITION(Register, j_rarg0); +REGISTER_DEFINITION(Register, j_rarg1); +REGISTER_DEFINITION(Register, j_rarg2); +REGISTER_DEFINITION(Register, j_rarg3); + +REGISTER_DEFINITION(Register, rdispatch); +REGISTER_DEFINITION(Register, rbcp); +REGISTER_DEFINITION(Register, rlocals); +REGISTER_DEFINITION(Register, rcpool); +REGISTER_DEFINITION(Register, rthread); +REGISTER_DEFINITION(Register, rscratch1); +REGISTER_DEFINITION(Register, rmethod); +REGISTER_DEFINITION(Register, rfp); +REGISTER_DEFINITION(Register, rscratch2); +REGISTER_DEFINITION(Register, sp); +REGISTER_DEFINITION(Register, lr); +REGISTER_DEFINITION(Register, r15_pc); --- /dev/null 2018-09-25 19:25:21.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/relocInfo_aarch32.cpp 2018-09-25 19:25:21.000000000 +0300 @@ -0,0 +1,144 @@ +/* + * Copyright (c) 1998, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "code/relocInfo.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/oop.inline.hpp" +#include "runtime/safepoint.hpp" + +void Relocation::pd_set_data_value(address x, intptr_t o, bool verify_only) { + + if (NativeFarLdr::is_at(addr())) { + NativeFarLdr *nal = NativeFarLdr::from(addr()); + address const_addr = NULL; + switch(type()) { + case relocInfo::oop_type: + const_addr = (address)code()->oop_addr_at(((oop_Relocation *)this)->oop_index()); + assert(*(address*)const_addr == x, "error in memory relocation"); + break; + case relocInfo::section_word_type: + const_addr = ((section_word_Relocation*)this)->target(); + assert(const_addr == x, "error in memory relocation"); + break; + default: + ShouldNotReachHere(); + } + assert(const_addr, "should not be NULL"); + if (verify_only) { + guarantee(nal->data_addr() == (intptr_t*) const_addr, "instructions must match"); + return; + } + nal->set_data_addr((intptr_t*) const_addr); + } else { + NativeMovConstReg *nm = NativeMovConstReg::from(addr()); + if (verify_only) { + guarantee(nm->data() == (intptr_t) x, "instructions must match"); + return; + } + nm->set_data((intptr_t) x); + } +} + +address Relocation::pd_call_destination(address orig_addr) { + intptr_t adj = 0; + if (orig_addr != NULL) { + // We just moved this call instruction from orig_addr to addr(). + // This means its target will appear to have grown by addr() - orig_addr. + adj = -( addr() - orig_addr ); + } + + NativeInstruction *ni = NativeInstruction::from(addr()); + + // Checking from shortest encoding size to longets, + // to avoid access beyond CodeCache boundary + if (NativeImmCall::is_at(addr())) { + return NativeImmCall::from(addr())->destination() + adj; + } else if (NativeImmJump::is_at(addr())) { + return NativeImmJump::from(addr())->destination() + adj; + } else if (NativeCall::is_at(addr())) { + return NativeCall::from(addr())->destination(); + } else if (NativeJump::is_at(addr())) { + return NativeJump::from(addr())->jump_destination(); + } + + ShouldNotReachHere(); + return NULL; +} + +void Relocation::pd_set_call_destination(address x) { + assert(addr() != x, "call instruction in an infinite loop"); // FIXME what's wrong to _generate_ loop? 
+ NativeInstruction *ni = NativeInstruction::from(addr()); + + // Checking from shortest encoding size to longets, + // to avoid access beyond CodeCache boundary + if (NativeImmCall::is_at(addr())) { + NativeImmCall::from(addr())->set_destination(x); + } else if (NativeImmJump::is_at(addr())) { + NativeImmJump::from(addr())->set_destination(x); + } else if (NativeCall::is_at(addr())) { + NativeCall::from(addr())->set_destination(x); + } else if (NativeJump::is_at(addr())) { + NativeJump::from(addr())->set_jump_destination(x); + } else { + ShouldNotReachHere(); + } + + assert(pd_call_destination(addr()) == x, "fail in reloc"); +} + +address* Relocation::pd_address_in_code() { + ShouldNotCallThis(); + return NULL; +} + +address Relocation::pd_get_address_from_code() { + ShouldNotCallThis(); + return NULL; +} + +void poll_Relocation::fix_relocation_after_move(const CodeBuffer* src, CodeBuffer* dest) { + NativeInstruction *ni = NativeInstruction::from(addr()); + if (ni->is_mov_const_reg()) { + address old_addr = old_addr_for(addr(), src, dest); + NativeMovConstReg *nm2 = NativeMovConstReg::from(old_addr); + NativeMovConstReg::from(addr())->set_data(nm2->data()); + } else { +#if 0 + warning("TODO: poll_Relocation::fix_relocation_after_move: " + "ensure relocating does nothing on relative instruction"); +#endif + } +} + +void metadata_Relocation::pd_fix_value(address x) { + if (NativeFarLdr::is_at(addr())) { + NativeFarLdr *nal = NativeFarLdr::from(addr()); + address const_addr = (address)code()->metadata_addr_at(((metadata_Relocation *)this)->metadata_index()); + nal->set_data_addr((intptr_t*) const_addr); + } +} --- /dev/null 2018-09-25 19:25:22.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/relocInfo_aarch32.hpp 2018-09-25 19:25:22.000000000 +0300 @@ -0,0 +1,45 @@ +/* + * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_RELOCINFO_AARCH32_HPP +#define CPU_AARCH32_VM_RELOCINFO_AARCH32_HPP + + // machine-dependent parts of class relocInfo + private: + enum { + // Relocations are byte-aligned. + offset_unit = 1, + // We don't use format(). + format_width = 0 + }; + + public: + + // This platform has no oops in the code that are not also + // listed in the oop section. 
+ static bool mustIterateImmediateOopsInCode() { return false; } + +#endif // CPU_AARCH32_VM_RELOCINFO_AARCH32_HPP --- /dev/null 2018-09-25 19:25:23.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/runtime_aarch32.cpp 2018-09-25 19:25:23.000000000 +0300 @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#ifdef COMPILER2 +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "classfile/systemDictionary.hpp" +#include "code/vmreg.hpp" +#include "interpreter/interpreter.hpp" +#include "opto/runtime.hpp" +#include "runtime/interfaceSupport.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/vframeArray.hpp" +#include "utilities/globalDefinitions.hpp" +#include "vmreg_aarch32.inline.hpp" +#endif + + --- /dev/null 2018-09-25 19:25:24.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/sharedRuntime_aarch32.cpp 2018-09-25 19:25:24.000000000 +0300 @@ -0,0 +1,3128 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "code/debugInfoRec.hpp" +#include "code/icBuffer.hpp" +#include "code/vtableStubs.hpp" +#include "interp_masm_aarch32.hpp" +#include "interpreter/interpreter.hpp" +#include "logging/log.hpp" +#include "oops/compiledICHolder.hpp" +#include "runtime/safepointMechanism.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/vframeArray.hpp" +#include "utilities/align.hpp" +#include "utilities/formatBuffer.hpp" +#include "vmreg_aarch32.inline.hpp" +#include "register_aarch32.hpp" +#include "vm_version_aarch32.hpp" +#ifdef COMPILER1 +#include "c1/c1_Runtime1.hpp" +#endif +#if COMPILER2_OR_JVMCI +#include "adfiles/ad_aarch32.hpp" +#include "opto/runtime.hpp" +#endif +#if INCLUDE_JVMCI +#include "jvmci/jvmciJavaClasses.hpp" +#endif + + +#define __ masm-> + +const int StackAlignmentInSlots = StackAlignmentInBytes / VMRegImpl::stack_slot_size; + +class SimpleRuntimeFrame { + + public: + + // Most of the runtime stubs have this simple frame layout. + // This class exists to make the layout shared in one place. + // Offsets are for compiler stack slots, which are jints. + enum layout { + // The frame sender code expects that rbp will be in the "natural" place and + // will override any oopMap setting for it. We must therefore force the layout + // so that it agrees with the frame sender code. + // we don't expect any arg reg save area so aarch32 asserts that + // frame::arg_reg_save_area_bytes == 0 + rbp_off = 0, + rbp_off2, + return_off, return_off2, + framesize + }; +}; + +// FIXME -- this is used by C1 +class RegisterSaver { + public: + static OopMap* save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool lr_pushed = false); + static void restore_live_registers(MacroAssembler* masm, bool restore_lr = true); + + // Capture info about frame layout + enum layout { + fpu_state_off = 0, + fpu_state_end = fpu_state_off+FPUStateSizeInWords-1, + // The frame sender code expects that rfp will be in + // the "natural" place and will override any oopMap + // setting for it. We must therefore force the layout + // so that it agrees with the frame sender code. + // + // FIXME there are extra saved register (from `push_CPU_state`) note that r11 == rfp + r0_off, + r1_off, + r2_off, + r3_off, + r4_off, + r5_off, + r6_off, + r7_off, + r8_off, rmethod_off = r8_off, + r9_off, rscratch1_off = r9_off, + r10_off, + r11_off, + r12_off, + r14_off, // with C2 can hold value different to LR entry in the frame + reg_save_size, + }; + + + // Offsets into the register save area + // Used by deoptimization when it is managing result register + // values on its own + + static int offset_in_bytes(int offset) { return offset * wordSize; } + +// During deoptimization only the result registers need to be restored, + // all the other values have already been extracted. 
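  // Illustrative calling pattern (a sketch, not part of the original patch):
  // a runtime stub generator would typically bracket a call into the VM with
  // this save/restore pair, roughly
  //
  //   int frame_size_words;
  //   OopMap* map = RegisterSaver::save_live_registers(masm, 0, &frame_size_words);
  //   // ... set_last_Java_frame(), call into the VM, record `map` at the call pc ...
  //   RegisterSaver::restore_live_registers(masm);
  //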
+ static void restore_result_registers(MacroAssembler* masm); + +}; + +OopMap* RegisterSaver::save_live_registers(MacroAssembler* masm, int additional_frame_words, int* total_frame_words, bool lr_pushed) { + int frame_size_in_bytes = additional_frame_words*wordSize + (reg_save_size + frame::get_frame_size()) *BytesPerInt; + int frame_size_in_slots = frame_size_in_bytes / BytesPerInt; + int additional_frame_slots = additional_frame_words*wordSize / BytesPerInt; + *total_frame_words = frame_size_in_bytes / wordSize;; + + if (lr_pushed) { + if (FrameAPCS) + Unimplemented(); + + __ push(rfp); + __ add(rfp, sp, wordSize); + } else + __ enter(); + __ push_CPU_state(); + + // Set an oopmap for the call site. This oopmap will map all + // oop-registers and debug-info registers as callee-saved. This + // will allow deoptimization at this safepoint to find all possible + // debug-info recordings, as well as let GC find all oops. + + OopMapSet *oop_maps = new OopMapSet(); + OopMap* oop_map = new OopMap(frame_size_in_slots, 0); + + oop_map->set_callee_saved(VMRegImpl::stack2reg(r0_off + additional_frame_slots), r0->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r1_off + additional_frame_slots), r1->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r2_off + additional_frame_slots), r2->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r3_off + additional_frame_slots), r3->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r4_off + additional_frame_slots), r4->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r5_off + additional_frame_slots), r5->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r6_off + additional_frame_slots), r6->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r7_off + additional_frame_slots), r7->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r8_off + additional_frame_slots), r8->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r9_off + additional_frame_slots), r9->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r10_off + additional_frame_slots), r10->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r12_off + additional_frame_slots), r12->as_VMReg()); + oop_map->set_callee_saved(VMRegImpl::stack2reg(r14_off + additional_frame_slots), r14->as_VMReg()); + if (hasFPU()) { + for (int i = 0; i < FPUStateSizeInWords; ++i) { + oop_map->set_callee_saved(VMRegImpl::stack2reg(fpu_state_off + i + additional_frame_slots), + as_FloatRegister(i)->as_VMReg()); + } + } + + return oop_map; +} + +void RegisterSaver::restore_live_registers(MacroAssembler* masm, bool restore_lr) { + __ pop_CPU_state(); + if (restore_lr) + __ leave(); + else { + if (FrameAPCS) + Unimplemented(); + + __ sub(sp, rfp, wordSize); + __ pop(rfp); + } +} + +void RegisterSaver::restore_result_registers(MacroAssembler* masm) { + + // Just restore result register. Only used by deoptimization. By + // now any callee save register that needs to be restored to a c2 + // caller of the deoptee has been extracted into the vframeArray + // and will be stuffed into the c2i adapter we create for later + // restoration so only result registers need to be restored here. 
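  // Worked example for the loads below (a sketch, assuming wordSize == 4 on
  // this 32-bit port): offset_in_bytes(fpu_state_off) == 0, so the FP result
  // d0 sits at the bottom of the save area, while the integer result pair
  // r0/r1 lives at offset_in_bytes(r0_off) == FPUStateSizeInWords * 4 and the
  // following word.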
+ + + if(hasFPU()) { + // Restore fp result register + __ vldr_f64(d0, Address(sp, offset_in_bytes(fpu_state_off))); + } + + // Restore integer result register + __ ldr(r0, Address(sp, offset_in_bytes(r0_off))); + __ ldr(r1, Address(sp, offset_in_bytes(r1_off))); + + // Pop all of the register save are off the stack + __ add(sp, sp, (reg_save_size + frame::get_frame_size()) * wordSize); +} + +// Is vector's size (in bytes) bigger than a size saved by default? +// 16 bytes XMM registers are saved by default using fxsave/fxrstor instructions. +bool SharedRuntime::is_wide_vector(int size) { + return size > 16; +} + +size_t SharedRuntime::trampoline_size() { + return NativeCall::instruction_size; +} + +void SharedRuntime::generate_trampoline(MacroAssembler *masm, address destination) { + __ mov(rscratch1, destination); + __ b(rscratch1); +} +// This functions returns offset from fp to java arguments on stack. +// +// The java_calling_convention describes stack locations as ideal slots on +// a frame with no abi restrictions. Since we must observe abi restrictions +// (like the placement of the register window) the slots must be biased by +// the following value. +static int reg2offset_in(VMReg r) { + // After stack frame created, fp points to 1 slot after previous sp value. + return (r->reg2stack() + 1) * VMRegImpl::stack_slot_size; +} + +static int reg2offset_out(VMReg r) { + return (r->reg2stack() + SharedRuntime::out_preserve_stack_slots()) * VMRegImpl::stack_slot_size; +} + +template static const T& min (const T& a, const T& b) { + return (a > b) ? b : a; +} + +// --------------------------------------------------------------------------- +// Read the array of BasicTypes from a signature, and compute where the +// arguments should go. Values in the VMRegPair regs array refer to 4-byte +// quantities. Values less than VMRegImpl::stack0 are registers, those above +// refer to 4-byte stack slots. All stack slots are based off of the stack pointer +// as framesizes are fixed. +// VMRegImpl::stack0 refers to the first slot 0(sp). +// and VMRegImpl::stack0+1 refers to the memory word 4-byes higher. Register +// up to RegisterImpl::number_of_registers) are the 64-bit +// integer registers. + +// Note: the INPUTS in sig_bt are in units of Java argument words, +// which are 64-bit. The OUTPUTS are in 32-bit units. + +int SharedRuntime::java_calling_convention(const BasicType *sig_bt, + VMRegPair *regs, + int total_args_passed, + int is_outgoing) { + + assert(j_rarg0 == c_rarg0, "assumed"); + +#ifndef HARD_FLOAT_CC + if (hasFPU()) { + // Create the mapping between argument positions and + // registers. 
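  // Illustration (not from the original patch): with the tables below, a Java
  // signature (int, long, float, double, int) is assigned as
  //
  //   int    -> j_rarg0
  //   long   -> the even-aligned pair starting at j_rarg2
  //   float  -> f0
  //   double -> the even-aligned pair f2:f3
  //   int    -> stack slot 0 (all four integer argument registers are taken)
  //
  // mirroring the AAPCS rule that 64-bit values occupy an even/odd pair.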
+ static const Register INT_ArgReg[Argument::n_int_register_parameters_j] = { + j_rarg0, j_rarg1, j_rarg2, j_rarg3 + }; + const uint FP_ArgReg_N = 16; + static const FloatRegister FP_ArgReg[] = { + f0, f1, f2, f3, + f4, f5, f6, f7, + f8, f9, f10, f11, + f12, f13, f14, f15, + }; + + uint int_args = 0; + uint fp_args = 0; + uint stk_args = 0; + + for (int i = 0; i < total_args_passed; i++) { + switch (sig_bt[i]) { + case T_FLOAT: + if (fp_args < FP_ArgReg_N) { + regs[i].set1(FP_ArgReg[fp_args++]->as_VMReg()); + } else { + regs[i].set1(VMRegImpl::stack2reg(stk_args)); + stk_args += 1; + } + break; + case T_BOOLEAN: + case T_CHAR: + case T_BYTE: + case T_SHORT: + case T_INT: + case T_OBJECT: + case T_ARRAY: + case T_ADDRESS: + if (int_args < Argument::n_int_register_parameters_j) { + regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); + } else { + regs[i].set1(VMRegImpl::stack2reg(stk_args)); + stk_args += 1; + } + break; + case T_VOID: + // halves of T_LONG or T_DOUBLE + assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); + regs[i].set_bad(); + break; + case T_DOUBLE: + assert(sig_bt[i + 1] == T_VOID, "expecting half"); + fp_args = align_up(fp_args, 2); + if (fp_args < FP_ArgReg_N) { + regs[i].set2(FP_ArgReg[fp_args]->as_VMReg()); + fp_args += 2; + } else { + regs[i].set2(VMRegImpl::stack2reg(stk_args)); + stk_args += 2; + } + break; + case T_LONG: + assert(sig_bt[i + 1] == T_VOID, "expecting half"); + if (int_args + 1 < Argument::n_int_register_parameters_j) { + if ((int_args % 2) != 0) { + ++int_args; + } + regs[i].set2(INT_ArgReg[int_args]->as_VMReg()); + int_args += 2; + } else { + if (stk_args % 2 != 0) { + ++stk_args; + } + regs[i].set2(VMRegImpl::stack2reg(stk_args)); + stk_args += 2; + int_args = Argument::n_int_register_parameters_j; + } + break; + default: + ShouldNotReachHere(); + break; + } + } + + return align_up(stk_args, StackAlignmentInBytes/wordSize); + } else +#endif // ndef HARD_FLOAT_CC + { + // in aarch32 pure soft-float mode the java calling convention is set the same as C one + return c_calling_convention(sig_bt, regs, NULL, total_args_passed); + } +} + +// Patch the callers callsite with entry to compiled code if it exists. +static void patch_callers_callsite(MacroAssembler *masm) { + Label L; + __ ldr(rscratch1, Address(rmethod, in_bytes(Method::code_offset()))); + __ cbz(rscratch1, L); + + __ enter(); + __ push_CPU_state(); + + // VM needs caller's callsite + // VM needs target method + // This needs to be a long call since we will relocate this adapter to + // the codeBuffer and it may not reach + +#ifndef PRODUCT + assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); +#endif + + __ mov(c_rarg0, rmethod); + __ mov(c_rarg1, lr); + __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, SharedRuntime::fixup_callers_callsite))); + __ bl(rscratch1); + __ maybe_isb(); + + __ pop_CPU_state(); + // restore sp + __ leave(); + __ bind(L); +} + +static void gen_c2i_adapter(MacroAssembler *masm, + int total_args_passed, + int comp_args_on_stack, + const BasicType *sig_bt, + const VMRegPair *regs, + Label& skip_fixup) { + // Before we get into the guts of the C2I adapter, see if we should be here + // at all. We've come from compiled code and are attempting to jump to the + // interpreter, which means the caller made a static call to get here + // (vcalls always get a compiled target if there is one). Check for a + // compiled target. If there is one, we need to patch the caller's call. 
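  // Sizing illustration (a sketch, assuming Interpreter::stackElementSize ==
  // wordSize == 4 on this 32-bit port): a method taking (long, int) has
  // total_args_passed == 3 Java slots, so the adapter reserves 3 * 4 = 12
  // bytes of interpreter argument space and fills it with pre-decrementing
  // stores relative to sp.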
+ patch_callers_callsite(masm); + + __ bind(skip_fixup); + + // Since all args are passed on the stack, total_args_passed * + // Interpreter::stackElementSize is the space we need. + + const int extraspace = total_args_passed * Interpreter::stackElementSize; + const Register compArgPos = lr; + int ld_shift = 0; + + __ str(compArgPos, Address(sp, -(extraspace + wordSize))); + __ mov(compArgPos, sp); + + // Now write the args into the outgoing interpreter space + for (int i = 0; i < total_args_passed; i++) { + + if (sig_bt[i] == T_VOID) { + assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); + continue; + } + + // next stack slot offset + const int next_off = -Interpreter::stackElementSize; + + VMReg r_1 = regs[i].first(); + VMReg r_2 = regs[i].second(); + if (!r_1->is_valid()) { + assert(!r_2->is_valid(), ""); + continue; + } + + if (r_2->is_valid()) { + assert(i + 1 < total_args_passed && sig_bt[i + 1] == T_VOID, "going to overrwrite reg_2 value"); + } + + if (r_1->is_stack()) { + // memory to memory use rscratch1 + int ld_off = r_1->reg2stack() * VMRegImpl::stack_slot_size - ld_shift; + if (!r_2->is_valid()) { + __ ldr(rscratch1, Address(compArgPos, ld_off)); + __ str(rscratch1, Address(sp, next_off, Address::pre)); + } else { + int tmp_off = ld_off; + // ldrd accepts only imm8 + if(abs(ld_off) > (255 << 2)) { + if(__ is_valid_for_imm12(ld_off)) { + __ add(compArgPos, compArgPos, ld_off); + } else { + // add operates encoded imm12, NOT plain + __ mov(rscratch1, ld_off); + __ add(compArgPos, compArgPos, rscratch1); + } + tmp_off = 0; + ld_shift += ld_off; + } + __ ldrd(rscratch1, rscratch2, Address(compArgPos, tmp_off)); + __ strd(rscratch1, rscratch2, Address(sp, 2* next_off, Address::pre)); + } + } else if (r_1->is_Register()) { + Register r = r_1->as_Register(); + assert(r != compArgPos, "compArgPos was modified"); + if (!r_2->is_valid()) { + __ str(r, Address(sp, next_off, Address::pre)); + } else { + assert(r_2->as_Register() != compArgPos, "compArgPos was modified"); + __ strd(r, r_2->as_Register(), Address(sp, 2 * next_off, Address::pre)); + } + } else { + assert(r_1->is_FloatRegister(), ""); + if (!r_2->is_valid()) { + // Can't do pre or post addressing for vldr, vstr + __ add(sp, sp, next_off); + __ vstr_f32(r_1->as_FloatRegister(), Address(sp)); + } else { + // TODO assert(r_2->is_FloatRegister() && r_2->as_FloatRegister() == r_1->as_FloatRegister() + 1, ""); + // Can't do pre or post addressing for vldr, vstr + __ add(sp, sp, 2 * next_off); + __ vstr_f64(r_1->as_FloatRegister(), Address(sp)); + } + } + } + + // hope, sp is returned to desired value + __ ldr(compArgPos, Address(sp, -wordSize)); + + // set sender sp + if(__ is_valid_for_imm12(extraspace)) { + __ add(r4, sp, extraspace); + } else { + __ mov(rscratch1, extraspace); + __ add(r4, sp, rscratch1); + } + + __ ldr(rscratch1, Address(rmethod, in_bytes(Method::interpreter_entry_offset()))); + __ b(rscratch1); +} + +static void range_check(MacroAssembler* masm, Register pc_reg, Register temp_reg, + address code_start, address code_end, + Label& L_ok) { + Label L_fail; + __ lea(temp_reg, ExternalAddress(code_start)); + __ cmp(pc_reg, temp_reg); + __ b(L_fail, Assembler::LO); + __ lea(temp_reg, ExternalAddress(code_end)); + __ cmp(pc_reg, temp_reg); + __ b(L_ok, Assembler::LO); + __ bind(L_fail); +} + +void SharedRuntime::gen_i2c_adapter(MacroAssembler *masm, + int total_args_passed, + int comp_args_on_stack, + const BasicType *sig_bt, + const VMRegPair *regs) { + + // Note: r13 contains the 
senderSP on entry. We must preserve it since + // we may do a i2c -> c2i transition if we lose a race where compiled + // code goes non-entrant while we get args ready. + + // In addition we use r13 to locate all the interpreter args because + // we must align the stack to 16 bytes. + + // Adapters are frameless. + + // An i2c adapter is frameless because the *caller* frame, which is + // interpreted, routinely repairs its own sp (from + // interpreter_frame_last_sp), even if a callee has modified the + // stack pointer. It also recalculates and aligns sp. + + // A c2i adapter is frameless because the *callee* frame, which is + // interpreted, routinely repairs its caller's sp (from sender_sp, + // which is set up via the senderSP register). + + // In other words, if *either* the caller or callee is interpreted, we can + // get the stack pointer repaired after a call. + + // This is why c2i and i2c adapters cannot be indefinitely composed. + // In particular, if a c2i adapter were to somehow call an i2c adapter, + // both caller and callee would be compiled methods, and neither would + // clean up the stack pointer changes performed by the two adapters. + // If this happens, control eventually transfers back to the compiled + // caller, but with an uncorrected stack, causing delayed havoc. + + if (VerifyAdapterCalls && + (Interpreter::code() != NULL || StubRoutines::code1() != NULL)) { + // So, let's test for cascading c2i/i2c adapters right now. + // assert(Interpreter::contains($return_addr) || + // StubRoutines::contains($return_addr), + // "i2c adapter must return to an interpreter frame"); + __ block_comment("verify_i2c { "); + Label L_ok; + if (Interpreter::code() != NULL) + range_check(masm, lr, rscratch1, + Interpreter::code()->code_start(), Interpreter::code()->code_end(), + L_ok); + if (StubRoutines::code1() != NULL) + range_check(masm, lr, rscratch1, + StubRoutines::code1()->code_begin(), StubRoutines::code1()->code_end(), + L_ok); + if (StubRoutines::code2() != NULL) + range_check(masm, lr, rscratch1, + StubRoutines::code2()->code_begin(), StubRoutines::code2()->code_end(), + L_ok); + const char* msg = "i2c adapter must return to an interpreter frame"; + __ block_comment(msg); + __ stop(msg); + __ bind(L_ok); + __ block_comment("} verify_i2ce "); + } + + const int stack_space = align_up(comp_args_on_stack * VMRegImpl::stack_slot_size, StackAlignmentInBytes); + const int ld_high = total_args_passed *Interpreter::stackElementSize; + // Point to interpreter value (vs. tag) + const int next_off = -Interpreter::stackElementSize; // offset from ld ptr + const Register loadCounter = lr; + + // Align sp to StackAlignmentInBytes so compiled frame starts always aligned + // This is required by APCS, so all native code depends on it. 
The compiled + // Java code is not required to follow this standard however doing so + // simplifies the code because allows to have fixed size for compiled frames + __ mov(rscratch2, sp); + __ align_stack(); + if(total_args_passed) { + // put below reserved stack space, imm12 should be enough + __ str(loadCounter, Address(sp, -(stack_space + wordSize))); + + if(__ is_valid_for_imm12(ld_high)) { + __ add(loadCounter, rscratch2, ld_high); + } else { + // add operates encoded imm12, we need plain + __ mov(rscratch1, ld_high); + __ add(loadCounter, rscratch2, rscratch1); + } + } + + if(comp_args_on_stack) { + if(__ is_valid_for_imm12(stack_space)) { + __ sub(sp, sp, stack_space); + } else { + // add operates encoded imm12, we need plain + __ mov(rscratch1, stack_space); + __ sub(sp, sp, rscratch1); + } + } + + // +------+ -> r4 + // | 0 | \ + // | 1 | \ + // | 2 | - > Load in argument order going down. + // | x | / + // | N | / + // +------+ -> inital sp + // | pad | maybe 1 word to align the stack to 8 bytes + // | M | \ + // | x | \ + // | 2 | -> Load in argument order going up. + // | 1 | / + // | 0 | / + // +------+ -> + + + int sp_offset = 0; + + // Now generate the shuffle code. + for (int i = 0; i < total_args_passed; i++) { + + if (sig_bt[i] == T_VOID) { + assert(i > 0 && (sig_bt[i-1] == T_LONG || sig_bt[i-1] == T_DOUBLE), "missing half"); + continue; + } + + // Pick up 0, 1 or 2 words from SP+offset. + + // + // + // + VMReg r_1 = regs[i].first(); + VMReg r_2 = regs[i].second(); + if (!r_1->is_valid()) { + assert(!r_2->is_valid(), ""); + continue; + } + + if (r_2->is_valid()) { + assert(i + 1 < total_args_passed && sig_bt[i + 1] == T_VOID, "going to overrwrite reg_2 value"); + } + + if (r_1->is_stack()) { + // Convert stack slot to an SP offset + int st_off = regs[i].first()->reg2stack()*VMRegImpl::stack_slot_size - sp_offset; + + if (!r_2->is_valid()) { + __ ldr(rscratch2, Address(loadCounter, next_off, Address::pre)); + __ str(rscratch2, Address(sp, st_off)); + } else { + int tmp_off = st_off; + if(abs(st_off) > (255 << 2)) { + //st_off doesn't fit imm8 required by strd + + if(__ is_valid_for_imm12(st_off)) { + __ add(sp, sp, st_off); + } else { + // add operates encoded imm12, NOT plain + __ mov(rscratch1, st_off); + __ add(sp, sp, rscratch1); + } + tmp_off = 0; + sp_offset += st_off; + } + + + // Interpreter local[n] == MSW, local[n+1] == LSW however locals + // are accessed as negative so LSW is at LOW address + + // this can be a misaligned move + __ ldrd(rscratch1, rscratch2, Address(loadCounter, 2 * next_off, Address::pre)); + __ strd(rscratch1, rscratch2, Address(sp, tmp_off)); + } + } else if (r_1->is_Register()) { // Register argument + Register r = r_1->as_Register(); + assert(r != loadCounter, "loadCounter is reloaded"); + if (r_2->is_valid()) { + assert(r_2->as_Register() != loadCounter, "loadCounter is reloaded"); + // this can be a misaligned move + // ldrd can handle inconsecutive registers + __ ldrd(r, r_2->as_Register(), Address(loadCounter, 2 * next_off, Address::pre)); + } else { + __ ldr(r, Address(loadCounter, next_off, Address::pre)); + } + } else { + assert(r_1->is_FloatRegister(), ""); + if (!r_2->is_valid()) { + // Can't do pre or post addressing for vldr, vstr + __ add(loadCounter, loadCounter, next_off); + __ vldr_f32(r_1->as_FloatRegister(), Address(loadCounter)); + } else { + // TODO assert(r_2->is_FloatRegister() && r_2->as_FloatRegister() == r_1->as_FloatRegister() + 1, ""); + // Can't do pre or post addressing for vldr, vstr + __ add(loadCounter, 
loadCounter, 2 * next_off); + __ vldr_f64(r_1->as_FloatRegister(), Address(loadCounter)); + } + } + } + + // restore sp + if(sp_offset) { + if(__ is_valid_for_imm12(sp_offset)) { + __ sub(sp, sp, sp_offset); + } else { + // add operates encoded imm12, we need plain + __ mov(rscratch1, sp_offset); + __ sub(sp, sp, rscratch1); + } + } + + if(total_args_passed) { + // restore loadCounter + __ ldr(loadCounter, Address(sp, -wordSize)); + } + + // 6243940 We might end up in handle_wrong_method if + // the callee is deoptimized as we race thru here. If that + // happens we don't want to take a safepoint because the + // caller frame will look interpreted and arguments are now + // "compiled" so it is much better to make this transition + // invisible to the stack walking code. Unfortunately if + // we try and find the callee by normal means a safepoint + // is possible. So we stash the desired callee in the thread + // and the vm will find there should this case occur. + + __ str(rmethod, Address(rthread, JavaThread::callee_target_offset())); + + // Will jump to the compiled code just as if compiled code was doing it. + __ ldr(rscratch1, Address(rmethod, in_bytes(Method::from_compiled_offset()))); + __ b(rscratch1); +} + +// --------------------------------------------------------------- +AdapterHandlerEntry* SharedRuntime::generate_i2c2i_adapters(MacroAssembler *masm, + int total_args_passed, + int comp_args_on_stack, + const BasicType *sig_bt, + const VMRegPair *regs, + AdapterFingerPrint* fingerprint) { + address i2c_entry = __ pc(); + gen_i2c_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs); + + address c2i_unverified_entry = __ pc(); + Label skip_fixup; + + Label ok; + + Register holder = rscratch2; + Register receiver = j_rarg0; + Register tmp = r8; // A call-clobbered register not used for arg passing + + // ------------------------------------------------------------------------- + // Generate a C2I adapter. On entry we know rmethod holds the Method* during calls + // to the interpreter. The args start out packed in the compiled layout. They + // need to be unpacked into the interpreter layout. This will almost always + // require some stack space. We grow the current (compiled) stack, then repack + // the args. We finally end in a jump to the generic interpreter entry point. + // On exit from the interpreter, the interpreter will restore our SP (lest the + // compiled code, which relys solely on SP and not FP, get sick). + + { + __ block_comment("c2i_unverified_entry {"); + __ load_klass(rscratch1, receiver); + __ ldr(tmp, Address(holder, CompiledICHolder::holder_klass_offset())); + __ cmp(rscratch1, tmp); + __ ldr(rmethod, Address(holder, CompiledICHolder::holder_metadata_offset())); + __ b(ok, Assembler::EQ); + __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); + + __ bind(ok); + // Method might have been compiled since the call site was patched to + // interpreted; if that is the case treat it as a miss so we can get + // the call site corrected. 
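  // The whole unverified-entry check, as C-like pseudocode (a sketch, not
  // part of the original patch):
  //
  //   if (receiver->klass() != ic_holder->holder_klass())  goto ic_miss_stub;
  //   rmethod = ic_holder->holder_metadata();              // target Method*
  //   if (rmethod->code() != NULL)                         goto ic_miss_stub;  // re-resolve
  //   goto skip_fixup;                                     // fall into the c2i entry
  //
  // where ic_holder is the CompiledICHolder arriving in rscratch2.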
+ __ ldr(rscratch1, Address(rmethod, in_bytes(Method::code_offset()))); + __ cbz(rscratch1, skip_fixup); + __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); + __ block_comment("} c2i_unverified_entry"); + } + + address c2i_entry = __ pc(); + + gen_c2i_adapter(masm, total_args_passed, comp_args_on_stack, sig_bt, regs, skip_fixup); + + __ flush(); + return AdapterHandlerLibrary::new_entry(fingerprint, i2c_entry, c2i_entry, c2i_unverified_entry); +} + +int SharedRuntime::c_calling_convention(const BasicType *sig_bt, + VMRegPair *regs, + VMRegPair *regs2, + int total_args_passed) { + assert(regs2 == NULL, "not needed on AArch32"); + +// We return the amount of VMRegImpl stack slots we need to reserve for all +// the arguments NOT counting out_preserve_stack_slots. + + static const Register INT_ArgReg[Argument::n_int_register_parameters_c] = { + c_rarg0, c_rarg1, c_rarg2, c_rarg3 + }; +#ifdef HARD_FLOAT_CC + const int FP_ArgReg_N = 16; + static const FloatRegister FP_ArgReg[] = { + f0, f1, f2, f3, + f4, f5, f6, f7, + f8, f9, f10, f11, + f12, f13, f14, f15, + }; + unsigned long fp_free_mask = (1 << FP_ArgReg_N) - 1; + uint fp_args = 0; +#endif //HARD_FLOAT_CC + + uint int_args = 0; + uint stk_args = 0; + + for (int i = 0; i < total_args_passed; i++) { + switch (sig_bt[i]) { + case T_BOOLEAN: + case T_CHAR: + case T_BYTE: + case T_SHORT: + case T_INT: + case T_OBJECT: + case T_ARRAY: + case T_ADDRESS: + case T_METADATA: +#ifndef HARD_FLOAT_CC + // soft FP case + case T_FLOAT: +#endif + if (int_args < Argument::n_int_register_parameters_c) { + regs[i].set1(INT_ArgReg[int_args++]->as_VMReg()); + } else { + regs[i].set1(VMRegImpl::stack2reg(stk_args)); + stk_args += 1; + } + break; +#ifndef HARD_FLOAT_CC + // soft FP case + case T_DOUBLE: +#endif + case T_LONG: + assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); + if (int_args + 1 < Argument::n_int_register_parameters_c) { + if ((int_args % 2) != 0) { + ++int_args; + } + regs[i].set2(INT_ArgReg[int_args]->as_VMReg()); + int_args += 2; + } else { + if (stk_args % 2 != 0) { + ++stk_args; + } + regs[i].set2(VMRegImpl::stack2reg(stk_args)); + stk_args += 2; + int_args = Argument::n_int_register_parameters_c; + } + break; +#ifdef HARD_FLOAT_CC + case T_FLOAT: + if (fp_free_mask & ((1 << FP_ArgReg_N)-1)) { + unsigned index = __builtin_ctz(fp_free_mask); + regs[i].set1(FP_ArgReg[index]->as_VMReg()); + fp_free_mask &= ~(1 << index); + fp_args += 2 * ((~index) & 1); + } else { + regs[i].set1(VMRegImpl::stack2reg(stk_args)); + stk_args += 1; + } + break; + case T_DOUBLE: + assert((i + 1) < total_args_passed && sig_bt[i + 1] == T_VOID, "expecting half"); + if (fp_args + 1 < FP_ArgReg_N) { + fp_free_mask &= ~(3 << fp_args); + regs[i].set2(FP_ArgReg[fp_args]->as_VMReg()); + fp_args += 2; + } else { + regs[i].set2(VMRegImpl::stack2reg(stk_args)); + stk_args += 2; + } + break; +#endif //HARD_FLOAT_CC + case T_VOID: // Halves of longs and doubles + assert(i != 0 && (sig_bt[i - 1] == T_LONG || sig_bt[i - 1] == T_DOUBLE), "expecting half"); + regs[i].set_bad(); + break; + default: + ShouldNotReachHere(); + break; + } + } + + return align_up(stk_args, StackAlignmentInBytes/wordSize); +} + +// On 64 bit we will store integer like items to the stack as +// 64 bits items (sparc abi) even though java would only store +// 32bits for a parameter. 
On 32bit it will simply be 32 bits +// So this routine will do 32->32 on 32bit and 32->64 on 64bit + +static void move_int(MacroAssembler* masm, VMRegPair src, VMRegPair dst) { + if (src.first()->is_stack()) { + if (dst.first()->is_stack()) { + // stack to stack + __ ldr(rscratch1, Address(rfp, reg2offset_in(src.first()))); + __ str(rscratch1, Address(sp, reg2offset_out(dst.first()))); + } else { + // stack to reg + __ ldr(dst.first()->as_Register(), Address(rfp, reg2offset_in(src.first()))); + } + } else if (dst.first()->is_stack()) { + // reg to stack + __ str(src.first()->as_Register(), Address(sp, reg2offset_out(dst.first()))); + } else { + if (dst.first() != src.first()) { + __ mov(dst.first()->as_Register(), src.first()->as_Register()); + } + } +} + +// An oop arg. Must pass a handle not the oop itself +static void object_move(MacroAssembler* masm, + OopMap* map, + int oop_handle_offset, + int framesize_in_slots, + VMRegPair src, + VMRegPair dst, + bool is_receiver, + int* receiver_offset) { + + // must pass a handle. First figure out the location we use as a handle + + Register rHandle = dst.first()->is_stack() ? rscratch2 : dst.first()->as_Register(); + + // See if oop is NULL if it is we need no handle + + if (src.first()->is_stack()) { + + // Oop is already on the stack as an argument + int offset_in_older_frame = src.first()->reg2stack() + SharedRuntime::out_preserve_stack_slots(); + map->set_oop(VMRegImpl::stack2reg(offset_in_older_frame + framesize_in_slots)); + if (is_receiver) { + *receiver_offset = (offset_in_older_frame + framesize_in_slots) * VMRegImpl::stack_slot_size; + } + + __ ldr(rscratch1, Address(rfp, reg2offset_in(src.first()))); + __ lea(rHandle, Address(rfp, reg2offset_in(src.first()))); + // conditionally move a NULL + __ cmp(rscratch1, 0); + __ mov(rHandle, 0, Assembler::EQ); + } else { + + // Oop is in an a register we must store it to the space we reserve + // on the stack for oop_handles and pass a handle if oop is non-NULL + + const Register rOop = src.first()->as_Register(); + int oop_slot; + if (rOop == j_rarg0) + oop_slot = 0; + else if (rOop == j_rarg1) + oop_slot = 1; + else if (rOop == j_rarg2) + oop_slot = 2; + else { + assert(rOop == j_rarg3, "wrong register"); + oop_slot = 3; + } + + oop_slot = oop_slot * VMRegImpl::slots_per_word + oop_handle_offset; + int offset = oop_slot*VMRegImpl::stack_slot_size; + + map->set_oop(VMRegImpl::stack2reg(oop_slot)); + // Store oop in handle area, may be NULL + __ str(rOop, Address(sp, offset)); + if (is_receiver) { + *receiver_offset = offset; + } + + __ cmp(rOop, 0); + __ lea(rHandle, Address(sp, offset)); + // conditionally move a NULL + __ mov(rHandle, 0, Assembler::EQ); + } + + // If arg is on the stack then place it otherwise it is already in correct reg. 
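  // In C terms (editorial note): rHandle = (oop == NULL) ? NULL : &stack_slot_holding_oop,
  // so the native callee receives a jobject handle rather than a raw oop.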
+ if (dst.first()->is_stack()) { + __ str(rHandle, Address(sp, reg2offset_out(dst.first()))); + } +} + +// A float arg may have to do float reg int reg conversion +static void float_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) { + if(hasFPU()) { + if (src.first()->is_stack()) { + if (dst.first()->is_stack()) { + // stack to stack + // Have no vfp scratch registers, so copy via gpr + __ ldr(rscratch1, Address(rfp, reg2offset_in(src.first()))); + __ str(rscratch1, Address(sp, reg2offset_out(dst.first()))); + } else { + // stack to reg + __ vldr_f32(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first()))); + } + } else if (dst.first()->is_stack()) { + // reg to stack + __ vstr_f32(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first()))); + } else { +#ifndef HARD_FLOAT_CC + if(dst.first()->is_Register()) { + __ vmov_f32(dst.first()->as_Register(), src.first()->as_FloatRegister()); + } else +#endif + if (dst.first() != src.first()) { + __ vmov_f32(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister()); + } + } + } else { + move_int(masm, src, dst); + } +} + +// A long move +static void long_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) { + if (src.first()->is_stack()) { + if (dst.first()->is_stack()) { + // stack to stack + __ ldrd(rscratch1, rscratch2, Address(rfp, reg2offset_in(src.first()))); + __ strd(rscratch1, rscratch2, Address(sp, reg2offset_out(dst.first()))); + } else { + // stack to reg + __ ldrd(dst.first()->as_Register(), dst.second()->as_Register(), + Address(rfp, reg2offset_in(src.first()))); + } + } else if (dst.first()->is_stack()) { + // reg to stack + __ strd(src.first()->as_Register(), src.second()->as_Register(), + Address(sp, reg2offset_out(dst.first()))); + } else { + // reg to reg + if (dst.first() != src.first()) { + if (dst.first() != src.second()) { + __ mov(dst.first()->as_Register(), src.first()->as_Register()); + __ mov(dst.second()->as_Register(), src.second()->as_Register()); + } else { + __ mov(dst.second()->as_Register(), src.second()->as_Register()); + __ mov(dst.first()->as_Register(), src.first()->as_Register()); + } + } + } +} + +// A double move +static void double_move(MacroAssembler* masm, VMRegPair src, VMRegPair dst) { + if(hasFPU()) { + if (src.first()->is_stack()) { + if (dst.first()->is_stack()) { + // stack to stack + // Have no vfp scratch registers, so copy via gpr + __ ldrd(rscratch1, rscratch2, Address(rfp, reg2offset_in(src.first()))); + __ strd(rscratch1, rscratch2, Address(sp, reg2offset_out(dst.first()))); + } else { + // stack to reg + __ vldr_f64(dst.first()->as_FloatRegister(), Address(rfp, reg2offset_in(src.first()))); + } + } else if (dst.first()->is_stack()) { + // reg to stack + __ vstr_f64(src.first()->as_FloatRegister(), Address(sp, reg2offset_out(dst.first()))); + } else { +#ifndef HARD_FLOAT_CC + if(dst.first()->is_Register()) { + __ vmov_f64(dst.first()->as_Register(), dst.second()->as_Register(), src.first()->as_FloatRegister()); + } else +#endif + if (dst.first() != src.first()) { + __ vmov_f64(dst.first()->as_FloatRegister(), src.first()->as_FloatRegister()); + } + } + } else { + long_move(masm, src, dst); + } +} + + +void SharedRuntime::save_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { + // We always ignore the frame_slots arg and just use the space just below frame pointer + // which by this time is free to use + switch (ret_type) { + case T_DOUBLE: +#ifdef HARD_FLOAT_CC + __ vstr_f64(d0, Address(rfp, 
-(frame::get_frame_size() + 1) * wordSize)); + break; +#endif//fall through otherwise + case T_LONG: + __ strd(r0, r1, Address(rfp, -(frame::get_frame_size() + 1) * wordSize)); + break; + case T_VOID: + break; + case T_FLOAT: +#ifdef HARD_FLOAT_CC + __ vstr_f32(f0, Address(rfp, -frame::get_frame_size() * wordSize)); + break; +#endif//fall through otherwise + default: + __ str(r0, Address(rfp, -frame::get_frame_size() * wordSize)); + break; + } +} + +void SharedRuntime::restore_native_result(MacroAssembler *masm, BasicType ret_type, int frame_slots) { + // We always ignore the frame_slots arg and just use the space just below frame pointer + // which by this time is free to use + switch (ret_type) { + case T_DOUBLE: +#ifdef HARD_FLOAT_CC + __ vldr_f64(d0, Address(rfp, -(frame::get_frame_size() + 1) * wordSize)); + break; +#endif//fall through otherwise + case T_LONG: + __ ldrd(r0, r1, Address(rfp, -(frame::get_frame_size() + 1) * wordSize)); + break; + case T_VOID: + break; + case T_FLOAT: +#ifdef HARD_FLOAT_CC + __ vldr_f32(d0, Address(rfp, -frame::get_frame_size() * wordSize)); + break; +#endif//fall through otherwise + default: + __ ldr(r0, Address(rfp, -frame::get_frame_size() * wordSize)); + break; + } +} + +static int save_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { + RegSet x; + int saved_slots = 0; + for ( int i = first_arg ; i < arg_count ; i++ ) { + if (args[i].first()->is_Register()) { + x = x + args[i].first()->as_Register(); + ++saved_slots; + } + if (args[i].second()->is_Register()) { + x = x + args[i].second()->as_Register(); + ++saved_slots; + } +#ifdef HARD_FLOAT_CC + else if (args[i].first()->is_FloatRegister()) { + FloatRegister fr = args[i].first()->as_FloatRegister(); + + if (args[i].second()->is_FloatRegister()) { + assert(args[i].is_single_phys_reg(), "doubles should be 2 consequents float regs"); + __ decrement(sp, 2 * wordSize); + __ vstr_f64(fr, Address(sp)); + saved_slots += 2; + } else { + __ decrement(sp, wordSize); + __ vstr_f32(fr, Address(sp)); + ++saved_slots; + } + } +#endif//HARD_FLOAT_CC + } + __ push(x, sp); + return saved_slots; +} + +static void restore_args(MacroAssembler *masm, int arg_count, int first_arg, VMRegPair *args) { + RegSet x; + for ( int i = first_arg ; i < arg_count ; i++ ) { + if (args[i].first()->is_Register()) { + x = x + args[i].first()->as_Register(); + } else { + ; + } + if (args[i].second()->is_Register()) { + x = x + args[i].second()->as_Register(); + } + } + __ pop(x, sp); + for ( int i = first_arg ; i < arg_count ; i++ ) { + if (args[i].first()->is_Register()) { + ; + } +#ifdef HARD_FLOAT_CC + else if (args[i].first()->is_FloatRegister()) { + FloatRegister fr = args[i].first()->as_FloatRegister(); + + if (args[i].second()->is_FloatRegister()) { + assert(args[i].is_single_phys_reg(), "doubles should be 2 consequents float regs"); + __ vstr_f64(fr, Address(sp)); + __ increment(sp, 2 * wordSize); + } else { + __ vstr_f32(fr, Address(sp)); + __ increment(sp, wordSize); + } + } +#endif//HARD_FLOAT_CC + } +} + + +// Check GCLocker::needs_gc and enter the runtime if it's true. This +// keeps a new JNI critical region from starting until a GC has been +// forced. Save down any oops in registers and describe them in an +// OopMap. 
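// Note on save_native_result()/restore_native_result() above (a sketch,
// assuming wordSize == 4 and frame::get_frame_size() == 2, i.e. a plain
// saved fp/lr pair): a long or double result is parked in the two words at
// rfp - 12, while a 32-bit result uses the single word at rfp - 8.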
+static void check_needs_gc_for_critical_native(MacroAssembler* masm, + int stack_slots, + int total_c_args, + int total_in_args, + int arg_save_area, + OopMapSet* oop_maps, + VMRegPair* in_regs, + BasicType* in_sig_bt) { Unimplemented(); } + +// Unpack an array argument into a pointer to the body and the length +// if the array is non-null, otherwise pass 0 for both. +static void unpack_array_argument(MacroAssembler* masm, VMRegPair reg, BasicType in_elem_type, VMRegPair body_arg, VMRegPair length_arg) { Unimplemented(); } + + +class ComputeMoveOrder: public StackObj { + class MoveOperation: public ResourceObj { + friend class ComputeMoveOrder; + private: + VMRegPair _src; + VMRegPair _dst; + int _src_index; + int _dst_index; + bool _processed; + MoveOperation* _next; + MoveOperation* _prev; + + static int get_id(VMRegPair r) { Unimplemented(); return 0; } + + public: + MoveOperation(int src_index, VMRegPair src, int dst_index, VMRegPair dst): + _src(src) + , _src_index(src_index) + , _dst(dst) + , _dst_index(dst_index) + , _next(NULL) + , _prev(NULL) + , _processed(false) { Unimplemented(); } + + VMRegPair src() const { Unimplemented(); return _src; } + int src_id() const { Unimplemented(); return 0; } + int src_index() const { Unimplemented(); return 0; } + VMRegPair dst() const { Unimplemented(); return _src; } + void set_dst(int i, VMRegPair dst) { Unimplemented(); } + int dst_index() const { Unimplemented(); return 0; } + int dst_id() const { Unimplemented(); return 0; } + MoveOperation* next() const { Unimplemented(); return 0; } + MoveOperation* prev() const { Unimplemented(); return 0; } + void set_processed() { Unimplemented(); } + bool is_processed() const { Unimplemented(); return 0; } + + // insert + void break_cycle(VMRegPair temp_register) { Unimplemented(); } + + void link(GrowableArray& killer) { Unimplemented(); } + }; + + private: + GrowableArray edges; + + public: + ComputeMoveOrder(int total_in_args, VMRegPair* in_regs, int total_c_args, VMRegPair* out_regs, + BasicType* in_sig_bt, GrowableArray& arg_order, VMRegPair tmp_vmreg) { Unimplemented(); } + + // Collected all the move operations + void add_edge(int src_index, VMRegPair src, int dst_index, VMRegPair dst) { Unimplemented(); } + + // Walk the edges breaking cycles between moves. 
The result list + // can be walked in order to produce the proper set of loads + GrowableArray* get_store_order(VMRegPair temp_register) { Unimplemented(); return 0; } +}; + + +static void rt_call(MacroAssembler* masm, address dest) { + CodeBlob *cb = CodeCache::find_blob(dest); + if (cb) { + __ far_call(RuntimeAddress(dest), NULL); + } else { + __ lea(rscratch2, RuntimeAddress(dest)); + __ bl(rscratch2); + __ maybe_isb(); + } +} + +static void verify_oop_args(MacroAssembler* masm, + const methodHandle &method, + const BasicType* sig_bt, + const VMRegPair* regs) { + Register temp_reg = rscratch2; // not part of any compiled calling seq + if (VerifyOops) { + for (int i = 0; i < method->size_of_parameters(); i++) { + if (sig_bt[i] == T_OBJECT || + sig_bt[i] == T_ARRAY) { + VMReg r = regs[i].first(); + assert(r->is_valid(), "bad oop arg"); + if (r->is_stack()) { + __ ldr(temp_reg, Address(sp, r->reg2stack() * VMRegImpl::stack_slot_size)); + __ verify_oop(temp_reg); + } else { + __ verify_oop(r->as_Register()); + } + } + } + } +} + +static void gen_special_dispatch(MacroAssembler* masm, + const methodHandle &method, + const BasicType* sig_bt, + const VMRegPair* regs) { + verify_oop_args(masm, method, sig_bt, regs); + vmIntrinsics::ID iid = method->intrinsic_id(); + + // Now write the args into the outgoing interpreter space + bool has_receiver = false; + Register receiver_reg = noreg; + int member_arg_pos = -1; + Register member_reg = noreg; + int ref_kind = MethodHandles::signature_polymorphic_intrinsic_ref_kind(iid); + if (ref_kind != 0) { + member_arg_pos = method->size_of_parameters() - 1; // trailing MemberName argument + member_reg = r4; + has_receiver = MethodHandles::ref_kind_has_receiver(ref_kind); + } else if (iid == vmIntrinsics::_invokeBasic) { + has_receiver = true; + } else { + fatal("unexpected intrinsic id %d", iid); + } + + if (member_reg != noreg) { + // Load the member_arg into register, if necessary. + SharedRuntime::check_member_name_argument_is_last_argument(method, sig_bt, regs); + VMReg r = regs[member_arg_pos].first(); + if (r->is_stack()) { + __ ldr(member_reg, Address(sp, r->reg2stack() * VMRegImpl::stack_slot_size)); + } else { + // no data motion is needed + member_reg = r->as_Register(); + } + } + + if (has_receiver) { + // Make sure the receiver is loaded into a register. + assert(method->size_of_parameters() > 0, "oob"); + assert(sig_bt[0] == T_OBJECT, "receiver argument must be an object"); + VMReg r = regs[0].first(); + assert(r->is_valid(), "bad receiver arg"); + if (r->is_stack()) { + // Porting note: This assumes that compiled calling conventions always + // pass the receiver oop in a register. If this is not true on some + // platform, pick a temp and load the receiver from stack. + fatal("receiver always in a register"); + } else { + // no data motion is needed + receiver_reg = r->as_Register(); + } + } + + // Figure out which address we are really jumping to: + MethodHandles::generate_method_handle_dispatch(masm, iid, + receiver_reg, member_reg, /*for_compiler_entry:*/ true); +} + +// --------------------------------------------------------------------------- +// Generate a native wrapper for a given method. The method takes arguments +// in the Java compiled code convention, marshals them to the native +// convention (handlizes oops, etc), transitions to native, makes the call, +// returns to java state (possibly blocking), unhandlizes any result and +// returns. 
+// +// Critical native functions are a shorthand for the use of +// GetPrimtiveArrayCritical and disallow the use of any other JNI +// functions. The wrapper is expected to unpack the arguments before +// passing them to the callee and perform checks before and after the +// native call to ensure that they GC_locker +// lock_critical/unlock_critical semantics are followed. Some other +// parts of JNI setup are skipped like the tear down of the JNI handle +// block and the check for pending exceptions it's impossible for them +// to be thrown. +// +// They are roughly structured like this: +// if (GC_locker::needs_gc()) +// SharedRuntime::block_for_jni_critical(); +// tranistion to thread_in_native +// unpack arrray arguments and call native entry point +// check for safepoint in progress +// check if any thread suspend flags are set +// call into JVM and possible unlock the JNI critical +// if a GC was suppressed while in the critical native. +// transition back to thread_in_Java +// return to caller +// +nmethod* SharedRuntime::generate_native_wrapper(MacroAssembler* masm, + const methodHandle& method, + int compile_id, + BasicType* in_sig_bt, + VMRegPair* in_regs, + BasicType ret_type) { + if (method->is_method_handle_intrinsic()) { + vmIntrinsics::ID iid = method->intrinsic_id(); + intptr_t start = (intptr_t)__ pc(); + int vep_offset = ((intptr_t)__ pc()) - start; + + // First instruction must be a nop as it may need to be patched on deoptimisation + __ nop(); + gen_special_dispatch(masm, + method, + in_sig_bt, + in_regs); + int frame_complete = ((intptr_t)__ pc()) - start; // not complete, period + __ flush(); + int stack_slots = SharedRuntime::out_preserve_stack_slots(); // no out slots at all, actually + return nmethod::new_native_nmethod(method, + compile_id, + masm->code(), + vep_offset, + frame_complete, + stack_slots / VMRegImpl::slots_per_word, + in_ByteSize(-1), + in_ByteSize(-1), + (OopMapSet*)NULL); + } + + bool is_critical_native = true; + address native_func = method->critical_native_function(); + if (native_func == NULL) { + native_func = method->native_function(); + is_critical_native = false; + } + assert(native_func != NULL, "must have function"); + + // An OopMap for lock (and class if static) + OopMapSet *oop_maps = new OopMapSet(); + intptr_t start = (intptr_t)__ pc(); + + // We have received a description of where all the java arg are located + // on entry to the wrapper. We need to convert these args to where + // the jni function will expect them. 
To figure out where they go + // we convert the java signature to a C signature by inserting + // the hidden arguments as arg[0] and possibly arg[1] (static method) + + const int total_in_args = method->size_of_parameters(); + int total_c_args = total_in_args; + if (!is_critical_native) { + total_c_args += 1; + if (method->is_static()) { + total_c_args++; + } + } else { + for (int i = 0; i < total_in_args; i++) { + if (in_sig_bt[i] == T_ARRAY) { + total_c_args++; + } + } + } + + BasicType* out_sig_bt = NEW_RESOURCE_ARRAY(BasicType, total_c_args); + VMRegPair* out_regs = NEW_RESOURCE_ARRAY(VMRegPair, total_c_args); + BasicType* in_elem_bt = NULL; + + int argc = 0; + if (!is_critical_native) { + out_sig_bt[argc++] = T_ADDRESS; + if (method->is_static()) { + out_sig_bt[argc++] = T_OBJECT; + } + + for (int i = 0; i < total_in_args ; i++ ) { + out_sig_bt[argc++] = in_sig_bt[i]; + } + } else { + Thread* THREAD = Thread::current(); + in_elem_bt = NEW_RESOURCE_ARRAY(BasicType, total_in_args); + SignatureStream ss(method->signature()); + for (int i = 0; i < total_in_args ; i++ ) { + if (in_sig_bt[i] == T_ARRAY) { + // Arrays are passed as int, elem* pair + out_sig_bt[argc++] = T_INT; + out_sig_bt[argc++] = T_ADDRESS; + Symbol* atype = ss.as_symbol(CHECK_NULL); + const char* at = atype->as_C_string(); + if (strlen(at) == 2) { + assert(at[0] == '[', "must be"); + switch (at[1]) { + case 'B': in_elem_bt[i] = T_BYTE; break; + case 'C': in_elem_bt[i] = T_CHAR; break; + case 'D': in_elem_bt[i] = T_DOUBLE; break; + case 'F': in_elem_bt[i] = T_FLOAT; break; + case 'I': in_elem_bt[i] = T_INT; break; + case 'J': in_elem_bt[i] = T_LONG; break; + case 'S': in_elem_bt[i] = T_SHORT; break; + case 'Z': in_elem_bt[i] = T_BOOLEAN; break; + default: ShouldNotReachHere(); + } + } + } else { + out_sig_bt[argc++] = in_sig_bt[i]; + in_elem_bt[i] = T_VOID; + } + if (in_sig_bt[i] != T_VOID) { + assert(in_sig_bt[i] == ss.type(), "must match"); + ss.next(); + } + } + } + + // Now figure out where the args must be stored and how much stack space + // they require. + int out_arg_slots; + out_arg_slots = c_calling_convention(out_sig_bt, out_regs, NULL, total_c_args); + + // Compute framesize for the wrapper. We need to handlize all oops in + // incoming registers + + // Calculate the total number of stack slots we will need. + + // First count the abi requirement plus all of the outgoing args + int stack_slots = SharedRuntime::out_preserve_stack_slots() + out_arg_slots; + + // Now the space for the inbound oop handle area + int total_save_slots = -1; + if (is_critical_native) { + // Critical natives may have to call out so they need a save area + // for register arguments. 
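  // Illustration of the signature rewriting above (not from the original
  // patch): for a critical native taking (byte[], int) the outgoing C
  // signature becomes (jint length, jbyte* body, jint) with no JNIEnv* and
  // no jclass, whereas the regular JNI form would be
  // (JNIEnv*, jclass-if-static, jbyteArray, jint).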
+ int double_slots = 0; + int single_slots = 0; + for ( int i = 0; i < total_in_args; i++) { + if (in_regs[i].first()->is_Register()) { + const Register reg = in_regs[i].first()->as_Register(); + switch (in_sig_bt[i]) { + case T_ARRAY: // critical array (uses 2 slots on LP64) + case T_BOOLEAN: + case T_BYTE: + case T_SHORT: + case T_CHAR: + case T_INT: single_slots++; break; + case T_LONG: double_slots++; break; + default: ShouldNotReachHere(); + } + } else +#ifdef HARD_FLOAT_CC + if (in_regs[i].first()->is_FloatRegister()) +#endif // HARD_FLOAT_CC + ShouldNotReachHere(); + } + total_save_slots = double_slots * 2 + single_slots; + // align the save area + if (double_slots != 0) { + stack_slots = align_up(stack_slots, 2); + } + } else { + total_save_slots = 4 * VMRegImpl::slots_per_word; // 4 arguments passed in registers + } + assert(total_save_slots != -1, "initialize total_save_slots!"); + + int oop_handle_offset = stack_slots; + stack_slots += total_save_slots; + + // Now any space we need for handlizing a klass if static method + + int klass_slot_offset = 0; + int klass_offset = -1; + int lock_slot_offset = 0; + bool is_static = false; + + if (method->is_static()) { + klass_slot_offset = stack_slots; + stack_slots += VMRegImpl::slots_per_word; + klass_offset = klass_slot_offset * VMRegImpl::stack_slot_size; + is_static = true; + } + + // Plus a lock if needed + + if (method->is_synchronized()) { + lock_slot_offset = stack_slots; + stack_slots += VMRegImpl::slots_per_word; + } + + const int enter_frame_size = frame::get_frame_size(); + + // Now a place (+2) to save return values or temp during shuffling + // + {2,4} words which pushed by enter() + // (return address (which we own), saved rfp, ...) + stack_slots += 2 + enter_frame_size; + + // Ok The space we have allocated will look like: + // + // + // FP-> | saved lr | + // |---------------------| + // | saved fp | + // |---------------------| + // | 2 slots for moves | + // |.....................| + // | 1 slot opt padding | + // |---------------------| + // | lock box (if sync) | + // |---------------------| <- lock_slot_offset + // | klass (if static) | + // |---------------------| <- klass_slot_offset + // | oopHandle area | + // |---------------------| <- oop_handle_offset (8 java arg registers) + // | outbound memory | + // | based arguments | + // | | + // |---------------------| + // | | + // SP-> | out_preserved_slots | + // + // + + + // Now compute actual number of stack words we need rounding to make + // stack properly aligned. + stack_slots = align_up(stack_slots, StackAlignmentInSlots); + + int stack_size = stack_slots * VMRegImpl::stack_slot_size; + + // First thing make an ic check to see if we should even be here + + // We are free to use all registers as temps without saving them and + // restoring them except rfp. rfp is the only callee save register + // as far as the interpreter and the compiler(s) are concerned. 
+ + + const Register ic_reg = rscratch2; + const Register receiver = j_rarg0; + + Label hit; + Label exception_pending; + + assert_different_registers(ic_reg, receiver, rscratch1); + __ verify_oop(receiver); + __ cmp_klass(receiver, ic_reg, rscratch1); + __ b(hit, Assembler::EQ); + + __ far_jump(RuntimeAddress(SharedRuntime::get_ic_miss_stub())); + + // Verified entry point must be aligned + __ align(8); + + __ bind(hit); + +#ifdef ASSERT + __ mov(ic_reg, 0xdead); // trash ic_reg(rscratch2), as used as real scratch further +#endif + + int vep_offset = ((intptr_t)__ pc()) - start; + + // Generate stack overflow check + + // If we have to make this method not-entrant we'll overwrite its + // first instruction with a jump. For this action to be legal we + // must ensure that this first instruction is a B, BL, NOP, BKPT, + // SVC, HVC, or SMC. Make it a NOP. + __ nop(); + + if (UseStackBanging) { + __ bang_stack_with_offset(JavaThread::stack_shadow_zone_size()); + } else { + Unimplemented(); + } + + // Generate a new frame for the wrapper. + __ enter(); + // some words are pushed by enter, so adjust frame size on this value + __ sub(sp, sp, stack_size - enter_frame_size * wordSize); + + // Frame is now completed as far as size and linkage. + int frame_complete = ((intptr_t)__ pc()) - start; + + if (is_critical_native) { + check_needs_gc_for_critical_native(masm, stack_slots, total_c_args, total_in_args, + oop_handle_offset, oop_maps, in_regs, in_sig_bt); + } + + // + // We immediately shuffle the arguments so that any vm call we have to + // make from here on out (sync slow path, jvmti, etc.) we will have + // captured the oops from our caller and have a valid oopMap for + // them. + + // ----------------- + // The Grand Shuffle + + // The Java calling convention is either equal (linux) or denser (win64) than the + // c calling convention. However the because of the jni_env argument the c calling + // convention always has at least one more (and two for static) arguments than Java. + // Therefore if we move the args from java -> c backwards then we will never have + // a register->register conflict and we don't have to build a dependency graph + // and figure out how to break any cycles. + // + + // Record sp-based slot for receiver on stack for non-static methods + int receiver_offset = -1; + + // This is a trick. We double the stack slots so we can claim + // the oops in the caller's frame. Since we are sure to have + // more args than the caller doubling is enough to make + // sure we can capture all the incoming oop args from the + // caller. + // + OopMap* map = new OopMap(stack_slots * 2, 0 /* arg_slots*/); + + // Mark location of rfp (someday) + // map->set_callee_saved(VMRegImpl::stack2reg( stack_slots - 2), stack_slots * 2, 0, vmreg(rfp)); + + +#ifdef ASSERT + bool reg_destroyed[RegisterImpl::number_of_registers]; + bool freg_destroyed[FloatRegisterImpl::number_of_registers]; + for ( int r = 0 ; r < RegisterImpl::number_of_registers ; r++ ) { + reg_destroyed[r] = false; + } + for ( int f = 0 ; f < FloatRegisterImpl::number_of_registers ; f++ ) { + freg_destroyed[f] = false; + } + +#endif // ASSERT + + // This may iterate in two different directions depending on the + // kind of native it is. The reason is that for regular JNI natives + // the incoming and outgoing registers are offset upwards and for + // critical natives they are offset down. 
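  // Illustration for the regular (non-critical) case below (a sketch): with
  // total_in_args == 2 and total_c_args == 3 (JNIEnv* prepended, non-static),
  // the loop pushes the pairs (1,2) and (0,1), so Java arg 1 is moved into C
  // slot 2 first and Java arg 0 into C slot 1 second, leaving C slot 0 free
  // for JNIEnv*; every destination index is ahead of its source, which is why
  // walking backwards cannot clobber a not-yet-moved argument.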
+ GrowableArray arg_order(2 * total_in_args); + VMRegPair tmp_vmreg; + tmp_vmreg.set2(rscratch2->as_VMReg()); + + if (!is_critical_native) { + for (int i = total_in_args - 1, c_arg = total_c_args - 1; i >= 0; i--, c_arg--) { + arg_order.push(i); + arg_order.push(c_arg); + } + } else { + // Compute a valid move order, using tmp_vmreg to break any cycles + ComputeMoveOrder cmo(total_in_args, in_regs, total_c_args, out_regs, in_sig_bt, arg_order, tmp_vmreg); + } + + int temploc = -1; + for (int ai = 0; ai < arg_order.length(); ai += 2) { + int i = arg_order.at(ai); + int c_arg = arg_order.at(ai + 1); + __ block_comment(err_msg("move %d -> %d", i, c_arg)); + if (c_arg == -1) { + assert(is_critical_native, "should only be required for critical natives"); + // This arg needs to be moved to a temporary + __ mov(tmp_vmreg.first()->as_Register(), in_regs[i].first()->as_Register()); + in_regs[i] = tmp_vmreg; + temploc = i; + continue; + } else if (i == -1) { + assert(is_critical_native, "should only be required for critical natives"); + // Read from the temporary location + assert(temploc != -1, "must be valid"); + i = temploc; + temploc = -1; + } +#ifdef ASSERT + if (in_regs[i].first()->is_Register()) { + assert(!reg_destroyed[in_regs[i].first()->as_Register()->encoding()], "destroyed reg!"); + } else if (in_regs[i].first()->is_FloatRegister()) { + assert(!freg_destroyed[in_regs[i].first()->as_FloatRegister()->encoding()], "destroyed reg!"); + } + if (out_regs[c_arg].first()->is_Register()) { + reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; + } else if (out_regs[c_arg].first()->is_FloatRegister()) { + freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true; + } +#endif // ASSERT + switch (in_sig_bt[i]) { + case T_ARRAY: + if (is_critical_native) { + unpack_array_argument(masm, in_regs[i], in_elem_bt[i], out_regs[c_arg + 1], out_regs[c_arg]); + c_arg++; +#ifdef ASSERT + if (out_regs[c_arg].first()->is_Register()) { + reg_destroyed[out_regs[c_arg].first()->as_Register()->encoding()] = true; + } else if (out_regs[c_arg].first()->is_FloatRegister()) { + freg_destroyed[out_regs[c_arg].first()->as_FloatRegister()->encoding()] = true; + } +#endif + break; + } + case T_OBJECT: + assert(!is_critical_native, "no oop arguments"); + object_move(masm, map, oop_handle_offset, stack_slots, in_regs[i], out_regs[c_arg], + ((i == 0) && (!is_static)), + &receiver_offset); + break; + case T_VOID: + break; + + case T_FLOAT: + float_move(masm, in_regs[i], out_regs[c_arg]); + break; + + case T_DOUBLE: + assert( i + 1 < total_in_args && + in_sig_bt[i + 1] == T_VOID && + out_sig_bt[c_arg+1] == T_VOID, "bad arg list"); + double_move(masm, in_regs[i], out_regs[c_arg]); + break; + + case T_LONG : + long_move(masm, in_regs[i], out_regs[c_arg]); + break; + + case T_BOOLEAN : + case T_BYTE : + case T_CHAR : + case T_SHORT : + case T_INT : + move_int(masm, in_regs[i], out_regs[c_arg]); + break; + + case T_ADDRESS: assert(false, "found T_ADDRESS in java args"); + case T_NARROWOOP : + case T_METADATA : + case T_NARROWKLASS : + default: + ShouldNotReachHere(); + } + } + + // point c_arg at the first arg that is already loaded in case we + // need to spill before we call out + int c_arg = total_c_args - total_in_args; + + // We use r4 as the oop handle for the receiver/klass + // It is callee save so it survives the call to native + + const Register oop_handle_reg = r4; + + // Pre-load a static method's oop. Used both by locking code and + // the normal JNI call code. 
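  // In C terms (editorial sketch) the block below does
  //   spill_slot = <mirror oop of the holder class>;  // embedded as an oop immediate
  //   c_rarg1    = &spill_slot;                       // becomes the jclass argument
  // and records the spill slot in the oop map so a GC at the call site can
  // find and update the mirror.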
+ if (method->is_static() && !is_critical_native) { + + // load oop into a register + __ movoop(oop_handle_reg, + JNIHandles::make_local(method->method_holder()->java_mirror()), + /*immediate*/true); + + // Now handlize the static class mirror it's known not-null. + __ str(oop_handle_reg, Address(sp, klass_offset)); + map->set_oop(VMRegImpl::stack2reg(klass_slot_offset)); + + // Now get the handle + __ lea(oop_handle_reg, Address(sp, klass_offset)); + // store the klass handle as second argument + __ mov(c_rarg1, oop_handle_reg); + // and protect the arg if we must spill + c_arg--; + } + + // Change state to native (we save the return address in the thread, since it might not + // be pushed on the stack when we do a a stack traversal). It is enough that the pc() + // points into the right code segment. It does not have to be the correct return pc. + // We use the same pc/oopMap repeatedly when we call out + + intptr_t the_pc = (intptr_t) __ pc(); + oop_maps->add_gc_map(the_pc - start, map); + + __ set_last_Java_frame(sp, noreg, (address)the_pc, rscratch1); + + + // We have all of the arguments setup at this point. We must not touch any register + // argument registers at this point (what if we save/restore them there are no oop? + +#ifdef DTRACE_ENABLED + { + SkipIfEqual skip(masm, &DTraceMethodProbes, false); + // protect the args we've loaded + (void) save_args(masm, total_c_args, c_arg, out_regs); + __ mov_metadata(c_rarg1, method()); + __ call_VM_leaf( + CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_entry), + rthread, c_rarg1); + restore_args(masm, total_c_args, c_arg, out_regs); + } +#endif + + // RedefineClasses() tracing support for obsolete method entry + if (log_is_enabled(Trace, redefine, class, obsolete)) { + // protect the args we've loaded + save_args(masm, total_c_args, c_arg, out_regs); + __ mov_metadata(c_rarg1, method()); + __ call_VM_leaf( + CAST_FROM_FN_PTR(address, SharedRuntime::rc_trace_method_entry), + rthread, c_rarg1); + restore_args(masm, total_c_args, c_arg, out_regs); + } + + // Lock a synchronized method + + // Register definitions used by locking and unlocking + + Label slow_path_lock; + Label lock_done; + + if (method->is_synchronized()) { + assert(!is_critical_native, "unhandled"); + + // registers below are not used to pass parameters + // and they are caller save in C1 + // => safe to use as temporary here + const Register swap_reg = r5; + const Register obj_reg = r6; // Will contain the oop + const Register lock_reg = r7; // Address of compiler lock object (BasicLock) + + const int mark_word_offset = BasicLock::displaced_header_offset_in_bytes(); + + // Get the handle (the 2nd argument) + __ mov(oop_handle_reg, c_rarg1); + + // Get address of the box + + __ lea(lock_reg, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size)); + + // Load the oop from the handle + __ ldr(obj_reg, Address(oop_handle_reg, 0)); + + if (UseBiasedLocking) { + __ biased_locking_enter(obj_reg, swap_reg, rscratch2, rscratch1, false, lock_done, &slow_path_lock); + } + + // Load (object->mark() | 1) into swap_reg %r0 + __ ldr(swap_reg, Address(obj_reg, oopDesc::mark_offset_in_bytes())); + __ orr(swap_reg, swap_reg, 1); + + // Save (object->mark() | 1) into BasicLock's displaced header + __ str(swap_reg, Address(lock_reg, mark_word_offset)); + + // src -> dest iff dest == r0 else r0 <- dest + { Label here; + __ cmpxchg_obj_header(swap_reg, lock_reg, obj_reg, rscratch1, lock_done, &slow_path_lock); + } + + // Slow path will re-enter here + __ bind(lock_done); + } + + + 
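
The fast-path locking sequence just emitted (displaced mark word plus compare-and-swap on the object header) can be modelled in ordinary C++ with std::atomic. This is a sketch only; the struct layout and names are invented and do not match HotSpot's real mark word. If the compare-and-swap fails, the generated code branches to slow_path_lock and ends up in SharedRuntime::complete_monitor_locking_C.

#include <atomic>
#include <cstdint>
#include <cstdio>

// Rough model of the fast path emitted above: the object's mark word is copied,
// with the low bit set, into an on-stack BasicLock ("displaced header"), and the
// header is then CAS'ed to point at that stack slot. Field names are invented.
struct ToyOop  { std::atomic<intptr_t> mark; };
struct ToyLock { intptr_t displaced; };

static bool fast_lock(ToyOop* obj, ToyLock* lock) {
  intptr_t unlocked = obj->mark.load() | 1;    // (object->mark() | 1)
  lock->displaced = unlocked;                  // save it in the BasicLock slot
  intptr_t expected = unlocked;
  // Only succeeds if the header still holds the unlocked pattern.
  return obj->mark.compare_exchange_strong(expected,
                                           reinterpret_cast<intptr_t>(lock));
}

int main() {
  ToyOop obj;
  obj.mark.store(0x10);
  ToyLock lock;
  printf("locked: %d, header: %#lx\n",
         (int)fast_lock(&obj, &lock), (unsigned long)obj.mark.load());
  return 0;
}
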
// Finally just about ready to make the JNI call + + + // get JNIEnv* which is first argument to native + if (!is_critical_native) { + __ lea(c_rarg0, Address(rthread, in_bytes(JavaThread::jni_environment_offset()))); + } + + // Now set thread in native + __ mov(rscratch1, _thread_in_native); + __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); + __ dmb(Assembler::ISH); + __ str(rscratch1, rscratch2); + + // Do the call + rt_call(masm, native_func); + + // Unpack native results. + switch (ret_type) { + case T_BOOLEAN: __ c2bool(r0); break; + case T_CHAR : __ uxth(r0, r0); break; + case T_BYTE : __ sxtb(r0, r0); break; + case T_SHORT : __ sxth(r0, r0); break; + case T_INT : break; + case T_FLOAT : +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f32(d0, r0); + } +#endif + break; + case T_DOUBLE : +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f64(d0, r0, r1); + } +#endif + break; + case T_ARRAY: // Really a handle + case T_OBJECT: // Really a handle + break; // can't de-handlize until after safepoint check + case T_VOID: break; + case T_LONG: break; + default : ShouldNotReachHere(); + } + + // Switch thread to "native transition" state before reading the synchronization state. + // This additional state is necessary because reading and testing the synchronization + // state is not atomic w.r.t. GC, as this scenario demonstrates: + // Java thread A, in _thread_in_native state, loads _not_synchronized and is preempted. + // VM thread changes sync state to synchronizing and suspends threads for GC. + // Thread A is resumed to finish this native method, but doesn't block here since it + // didn't see any synchronization is progress, and escapes. + __ mov(rscratch1, _thread_in_native_trans); + __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); + __ dmb(Assembler::ISH); + __ str(rscratch1, rscratch2); + + if(os::is_MP()) { + if (UseMembar) { + // Force this write out before the read below + __ membar(Assembler::AnyAny); + } else { + // Write serialization page so VM thread can do a pseudo remote membar. + // We use the current thread pointer to calculate a thread specific + // offset to write to within the page. This minimizes bus traffic + // due to cache line collision. + __ serialize_memory(rthread, rscratch1); + } + } + + Label after_transition; + + // check for safepoint operation in progress and/or pending suspend requests + { + Label Continue; + + Label L; + __ safepoint_poll_acquire(L); + __ ldr(rscratch1, Address(rthread, JavaThread::suspend_flags_offset())); + __ cbz(rscratch1, Continue); + __ bind(L); + + // Don't use call_VM as it will see a possible pending exception and forward it + // and never return here preventing us from clearing _last_native_pc down below. + // + save_native_result(masm, ret_type, stack_slots); + __ mov(c_rarg0, rthread); +#ifndef PRODUCT + assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); +#endif + if (!is_critical_native) { + __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans))); + } else { + __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans_and_transition))); + } + __ bl(rscratch1); + __ maybe_isb(); + // Restore any method result value + restore_native_result(masm, ret_type, stack_slots); + + if (is_critical_native) { + // The call above performed the transition to thread_in_Java so + // skip the transition logic below. 
+ __ b(after_transition); + } + + __ bind(Continue); + } + + // change thread state + __ mov(rscratch1, _thread_in_Java); + __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); + __ dmb(Assembler::ISH); + __ str(rscratch1, rscratch2); + __ bind(after_transition); + + Label reguard; + Label reguard_done; + __ ldrb(rscratch1, Address(rthread, JavaThread::stack_guard_state_offset())); + __ cmp(rscratch1, JavaThread::stack_guard_yellow_reserved_disabled); + __ b(reguard, Assembler::EQ); + __ bind(reguard_done); + + // native result if any is live + + // Unlock + Label unlock_done; + Label slow_path_unlock; + if (method->is_synchronized()) { + const Register obj_reg = r2; // Will contain the oop + const Register lock_reg = rscratch1; // Address of compiler lock object (BasicLock) + const Register old_hdr = r3; // value of old header at unlock time + + // Get locked oop from the handle we passed to jni + __ ldr(obj_reg, Address(oop_handle_reg, 0)); + + if (UseBiasedLocking) { + __ biased_locking_exit(obj_reg, old_hdr, unlock_done); + } + + // Simple recursive lock? + // get address of the stack lock + __ lea(lock_reg, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size)); + + // get old displaced header + __ ldr(old_hdr, Address(lock_reg, 0)); + __ cbz(old_hdr, unlock_done); + + // Atomic swap old header if oop still contains the stack lock + Label succeed; + __ cmpxchg_obj_header(lock_reg, old_hdr, obj_reg, rscratch2, succeed, &slow_path_unlock); + __ bind(succeed); + + // slow path re-enters here + __ bind(unlock_done); + } + +#ifdef DTRACE_ENABLED + { + SkipIfEqual skip(masm, &DTraceMethodProbes, false); + save_native_result(masm, ret_type, stack_slots); + __ mov_metadata(c_rarg1, method()); + __ call_VM_leaf( + CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_method_exit), + rthread, c_rarg1); + restore_native_result(masm, ret_type, stack_slots); + } +#endif + + __ reset_last_Java_frame(false); + + // Unbox oop result, e.g. JNIHandles::resolve result. + if (ret_type == T_OBJECT || ret_type == T_ARRAY) { + __ resolve_jobject(r0, rthread, rscratch2); + } + + if (CheckJNICalls) { + // clear_pending_jni_exception_check + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, JavaThread::pending_jni_exception_check_fn_offset())); + } + + if (!is_critical_native) { + // reset handle block + __ mov(rscratch1, 0); + __ ldr(r2, Address(rthread, JavaThread::active_handles_offset())); + __ str(rscratch1, Address(r2, JNIHandleBlock::top_offset_in_bytes())); + } + + __ leave(); + + if (!is_critical_native) { + // Any exception pending? + __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + __ cbnz(rscratch1, exception_pending); + } + + // We're done + __ b(lr); + + // Unexpected paths are out of line and go here + + if (!is_critical_native) { + // forward the exception + __ bind(exception_pending); + + // and forward the exception + __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); + } + + // Slow path locking & unlocking + if (method->is_synchronized()) { + + // BEGIN Slow path lock + __ bind(slow_path_lock); + + // has last_Java_frame setup. 
No exceptions so do vanilla call not call_VM + // args are (oop obj, BasicLock* lock, JavaThread* thread) + + // protect the args we've loaded + const int extra_words = save_args(masm, total_c_args, c_arg, out_regs); + + __ ldr(c_rarg0, Address(oop_handle_reg)); + __ lea(c_rarg1, Address(sp, (extra_words + lock_slot_offset) * VMRegImpl::stack_slot_size)); + __ mov(c_rarg2, rthread); + + // Not a leaf but we have last_Java_frame setup as we want + __ call_VM_leaf(CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_locking_C), 3); + restore_args(masm, total_c_args, c_arg, out_regs); + +#ifdef ASSERT + { Label L; + __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + __ cbz(rscratch1, L); + __ stop("no pending exception allowed on exit from monitorenter"); + __ bind(L); + } +#endif + __ b(lock_done); + + // END Slow path lock + + // BEGIN Slow path unlock + __ bind(slow_path_unlock); + + // If we haven't already saved the native result we must save it now as xmm registers + // are still exposed. + + save_native_result(masm, ret_type, stack_slots); + + __ mov(c_rarg2, rthread); + __ lea(c_rarg1, Address(sp, lock_slot_offset * VMRegImpl::stack_slot_size)); + __ ldr(c_rarg0, Address(oop_handle_reg)); + + // Save pending exception around call to VM (which contains an EXCEPTION_MARK) + __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + __ mov(rscratch2, 0); + __ str(rscratch2, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + + rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::complete_monitor_unlocking_C)); + +#ifdef ASSERT + { + Label L; + __ ldr(rscratch2, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + __ cbz(rscratch2, L); + __ stop("no pending exception allowed on exit complete_monitor_unlocking_C"); + __ bind(L); + } +#endif // ASSERT + + __ str(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + + restore_native_result(masm, ret_type, stack_slots); + + __ b(unlock_done); + + // END Slow path unlock + + } // synchronized + + // SLOW PATH Reguard the stack if needed + + __ bind(reguard); + save_native_result(masm, ret_type, stack_slots); + rt_call(masm, CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)); + restore_native_result(masm, ret_type, stack_slots); + // and continue + __ b(reguard_done); + + + + __ flush(); + + nmethod *nm = nmethod::new_native_nmethod(method, + compile_id, + masm->code(), + vep_offset, + frame_complete, + stack_slots / VMRegImpl::slots_per_word, + (is_static ? 
in_ByteSize(klass_offset) : in_ByteSize(receiver_offset)), + in_ByteSize(lock_slot_offset*VMRegImpl::stack_slot_size), + oop_maps); + + if (is_critical_native) { + nm->set_lazy_critical_native(true); + } + + return nm; +} + +// this function returns the adjust size (in number of words) to a c2i adapter +// activation for use during deoptimization +int Deoptimization::last_frame_adjust(int callee_parameters, int callee_locals) { + assert(callee_locals >= callee_parameters, + "test and remove; got more parms than locals"); + if (callee_locals < callee_parameters) + return 0; // No adjustment for negative locals + int diff = (callee_locals - callee_parameters) * Interpreter::stackElementWords; + // diff is counted in stack words + return align_up(diff, 2); +} + + +//------------------------------generate_deopt_blob---------------------------- +void SharedRuntime::generate_deopt_blob() { + + // Allocate space for the code + ResourceMark rm; + // Setup code generation tools + CodeBuffer buffer("deopt_blob", 2048, 1024); + MacroAssembler* masm = new MacroAssembler(&buffer); + int frame_size_in_words; + OopMap* map = NULL; + OopMapSet *oop_maps = new OopMapSet(); + + // ------------- + // This code enters when returning to a de-optimized nmethod. A return + // address has been pushed on the the stack, and return values are in + // registers. + // If we are doing a normal deopt then we were called from the patched + // nmethod from the point we returned to the nmethod. So the return + // address on the stack is wrong by NativeCall::instruction_size + // We will adjust the value so it looks like we have the original return + // address on the stack (like when we eagerly deoptimized). + // In the case of an exception pending when deoptimizing, we enter + // with a return address on the stack that points after the call we patched + // into the exception handler. We have the following register state from, + // e.g., the forward exception stub (see stubGenerator_x86_64.cpp). + // r0: exception oop + // r7: exception handler + // r3: throwing pc + // So in this case we simply jam r3 into the useless return address and + // the stack looks just like we want. + // + // At this point we need to de-opt. We save the argument return + // registers. We call the first C routine, fetch_unroll_info(). This + // routine captures the return values and returns a structure which + // describes the current frame size and the sizes of all replacement frames. + // The current frame is compiled code and may contain many inlined + // functions, each with their own JVM state. We pop the current frame, then + // push all the new frames. Then we call the C routine unpack_frames() to + // populate these frames. Finally unpack_frames() returns us the new target + // address. Notice that callee-save registers are BLOWN here; they have + // already been captured in the vframeArray at the time the return PC was + // patched. + address start = __ pc(); + Label cont; + + // Prolog for non exception case! + + // Save everything in sight. + map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, true); + + // Normal deoptimization. Save exec mode for unpack_frames. 
+ __ mov(r7, Deoptimization::Unpack_deopt); // callee-saved + __ b(cont); + + int reexecute_offset = __ pc() - start; + + // Reexecute case + // return address is the pc describes what bci to do re-execute at + + // No need to update map as each call to save_live_registers will produce identical oopmap + (void) RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words); + + __ mov(r7, Deoptimization::Unpack_reexecute); // callee-saved + __ b(cont); + + int exception_offset = __ pc() - start; + + // Prolog for exception case + + // all registers are dead at this entry point, except for r0, and + // r3 which contain the exception oop and exception pc + // respectively. Set them in TLS and fall thru to the + // unpack_with_exception_in_tls entry point. + + __ str(r3, Address(rthread, JavaThread::exception_pc_offset())); + __ str(r0, Address(rthread, JavaThread::exception_oop_offset())); + + int exception_in_tls_offset = __ pc() - start; + + // new implementation because exception oop is now passed in JavaThread + + // Prolog for exception case + // All registers must be preserved because they might be used by LinearScan + // Exceptiop oop and throwing PC are passed in JavaThread + // tos: stack at point of call to method that threw the exception (i.e. only + // args are on the stack, no return address) + + // The return address pushed by save_live_registers will be patched + // later with the throwing pc. The correct value is not available + // now because loading it from memory would destroy registers. + + // NB: The SP at this point must be the SP of the method that is + // being deoptimized. Deoptimization assumes that the frame created + // here by save_live_registers is immediately below the method's SP. + // This is a somewhat fragile mechanism. + + // Save everything in sight. + map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words); + + // Now it is safe to overwrite any register + + // Deopt during an exception. Save exec mode for unpack_frames. + __ mov(r7, Deoptimization::Unpack_exception); // callee-saved + + // load throwing pc from JavaThread and patch it as the return address + // of the current frame. Then clear the field in JavaThread + + __ ldr(r3, Address(rthread, JavaThread::exception_pc_offset())); + __ str(r3, Address(rfp, frame::get_return_addr_offset() * wordSize)); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + +#ifdef ASSERT + // verify that there is really an exception oop in JavaThread + __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ verify_oop(r0); + + // verify that there is no pending exception + Label no_pending_exception; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, no_pending_exception); + __ stop("must not have pending exception here"); + __ bind(no_pending_exception); +#endif + + __ bind(cont); + + // Call C code. Need thread and this frame, but NOT official VM entry + // crud. We cannot block on this call, no GC can happen. + // + // UnrollBlock* fetch_unroll_info(JavaThread* thread) + + // fetch_unroll_info needs to call last_java_frame(). 
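
Because fetch_unroll_info needs last_java_frame(), the code below first plants an anchor with set_last_Java_frame and clears it again with reset_last_Java_frame after the call. A toy model of that idea, with invented names (the real anchor lives in JavaThread and also records fp):

#include <cstdio>

// Toy model of the last_Java_frame anchor: before calling into the runtime the
// blob records enough state (an sp and a pc inside the blob) for a stack walker
// to start from, and clears it afterwards. Names here are illustrative only.
struct ToyThread { void* last_sp = nullptr; const void* last_pc = nullptr; };

static void runtime_call(ToyThread* t) {
  std::printf("stack walk would start at sp=%p pc=%p\n", t->last_sp, t->last_pc);
}

int main() {
  ToyThread t;
  int frame_word = 0;
  t.last_sp = &frame_word;                   // set_last_Java_frame(sp, ..., retaddr, ...)
  t.last_pc = (const void*)&runtime_call;    // any pc in this blob is precise enough
  runtime_call(&t);
  t.last_sp = nullptr;
  t.last_pc = nullptr;                       // reset_last_Java_frame()
  return 0;
}
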
+ + Label retaddr; + __ set_last_Java_frame(sp, noreg, retaddr, rscratch1); +#ifdef ASSERT0 + { Label L; + __ ldr(rscratch1, Address(rthread, + JavaThread::last_Java_fp_offset())); + __ cbz(rscratch1, L); + __ stop("SharedRuntime::generate_deopt_blob: last_Java_fp not cleared"); + __ bind(L); + } +#endif // ASSERT + __ mov(c_rarg0, rthread); + __ mov(c_rarg1, r7); // rcpool + __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::fetch_unroll_info))); + __ bl(rscratch1); + __ bind(retaddr); + + // Need to have an oopmap that tells fetch_unroll_info where to + // find any register it might need. + oop_maps->add_gc_map(__ pc() - start, map); + + __ reset_last_Java_frame(false); + + // Load UnrollBlock* into r5 + __ mov(r5, r0); + + Label noException; + __ cmp(r7, Deoptimization::Unpack_exception); // Was exception pending? + __ b(noException, Assembler::NE); + __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset())); + // QQQ this is useless it was NULL above + __ ldr(r3, Address(rthread, JavaThread::exception_pc_offset())); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ str(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); + + __ verify_oop(r0); + + // Overwrite the result registers with the exception results. + __ str(r0, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::r0_off))); + // I think this is useless + // __ str(r3, Address(sp, RegisterSaver::r3_offset_in_bytes())); + + __ bind(noException); + + // Only register save data is on the stack. + // Now restore the result registers. Everything else is either dead + // or captured in the vframeArray. + RegisterSaver::restore_result_registers(masm); + + // All of the register save area has been popped of the stack. Only the + // return address remains. + + // Pop all the frames we must move/replace. + // + // Frame picture (youngest to oldest) + // 1: self-frame (no frame link) + // 2: deopting frame (no frame link) + // 3: caller of deopting frame (could be compiled/interpreted). + // + // Note: by leaving the return address of self-frame on the stack + // and using the size of frame 2 to adjust the stack + // when we are done the return to frame 3 will still be on the stack. + + // Pop deoptimized frame + __ ldr(r2, Address(r5, Deoptimization::UnrollBlock::size_of_deoptimized_frame_offset_in_bytes())); + __ sub(r2, r2, frame::get_frame_size() * wordSize); + __ add(sp, sp, r2); + if (FrameAPCS) { + // frame constructed with + // push {r11, r12, lr, pc} + __ ldr(rfp, __ post(sp, 2 * wordSize)); + __ ldr(lr, __ post(sp, 2 * wordSize)); + } else { + __ ldrd(rfp, lr, __ post(sp, 2 * wordSize)); + } + // LR should now be the return address to the caller (3) + +#ifdef ASSERT + // Compilers generate code that bang the stack by as much as the + // interpreter would need. So this stack banging should never + // trigger a fault. Verify that it does not on non product builds. + if (UseStackBanging) { + __ ldr(rscratch2, Address(r5, Deoptimization::UnrollBlock::total_frame_sizes_offset_in_bytes())); + __ bang_stack_size(rscratch2, r2); + } +#endif + // Load address of array of frame pcs into r2 + __ ldr(r2, Address(r5, Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); + + // Trash the old pc + // __ addptr(sp, wordSize); FIXME ???? 
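
The frame bookkeeping performed around this point (pop the deoptimized frame, extend the caller by caller_adjustment, then push one skeletal interpreter frame per entry of frame_sizes/frame_pcs, which continues just below) is easier to follow in a simplified runnable model. The struct here is a made-up stand-in for Deoptimization::UnrollBlock, with sizes in abstract words and the stack growing downwards:

#include <cstdio>

// Simplified stand-in for Deoptimization::UnrollBlock -- just the fields the
// blob reads, with sizes in abstract words to keep the arithmetic visible.
struct ToyUnrollBlock {
  int size_of_deoptimized_frame;
  int caller_adjustment;
  int number_of_frames;
  const int* frame_sizes;      // one entry per skeletal interpreter frame
};

int main() {
  const int sizes[] = {18, 24, 16};             // made-up frame sizes
  ToyUnrollBlock info = {40, 6, 3, sizes};
  int sp = 1000;                                // abstract sp, stack grows downwards
  sp += info.size_of_deoptimized_frame;         // pop the deoptimized compiled frame
  sp -= info.caller_adjustment;                 // extend the caller for extra locals
  for (int i = 0; i < info.number_of_frames; i++) {
    sp -= info.frame_sizes[i];                  // push one skeletal interpreter frame
    std::printf("after frame %d: sp = %d\n", i, sp);
  }
  return 0;
}
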
+ + // Load address of array of frame sizes into r4 + __ ldr(r4, Address(r5, Deoptimization::UnrollBlock::frame_sizes_offset_in_bytes())); + + // Load counter into r3 + __ ldr(r3, Address(r5, Deoptimization::UnrollBlock::number_of_frames_offset_in_bytes())); + + // Now adjust the caller's stack to make up for the extra locals + // but record the original sp so that we can save it in the skeletal interpreter + // frame and the stack walking of interpreter_sender will get the unextended sp + // value and not the "real" sp value. + + const Register sender_sp = r6; + + __ mov(sender_sp, sp); + __ ldr(rscratch1, Address(r5, + Deoptimization::UnrollBlock:: + caller_adjustment_offset_in_bytes())); + __ sub(sp, sp, rscratch1); + + // Push interpreter frames in a loop + __ mov(rscratch1, (address)0xDEADDEAD); // Make a recognizable pattern + // Initially used to place 0xDEADDEAD in rscratch2 as well - why? + __ mov(rscratch2, 0); + Label loop; + __ bind(loop); + __ ldr(rscratch1, Address(__ post(r4, wordSize))); // Load frame size + __ sub(rscratch1, rscratch1, frame::get_frame_size() * wordSize); // We'll push frame backtrace by hand + __ ldr(lr, Address(__ post(r2, wordSize))); // Load pc + __ enter(); // Save old & set new fp + __ sub(sp, sp, rscratch1); // Prolog + // This value is corrected by layout_activation_impl + __ str(rscratch2, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + __ str(sender_sp, Address(rfp, frame::get_interpreter_frame_sender_sp_offset() * wordSize)); // Make it walkable + __ mov(sender_sp, sp); // Pass sender_sp to next frame + __ sub(r3, r3, 1); // Decrement counter + __ cbnz(r3, loop); + + // Re-push self-frame + __ ldr(lr, Address(r2)); + __ enter(); + + // Allocate a full sized register save area. We subtract frame::get_frame_size() words, + // because enter() just pushed them. + __ sub(sp, sp, (frame_size_in_words - frame::get_frame_size()) * wordSize); + + // Restore frame locals after moving the frame + if(hasFPU()) { + __ vstr_f64(d0, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::fpu_state_off))); + } + __ strd(r0, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::r0_off))); + + // Call C code. Need thread but NOT official VM entry + // crud. We cannot block on this call, no GC can happen. Call should + // restore return values to their stack-slots with the new SP. + // + // void Deoptimization::unpack_frames(JavaThread* thread, int exec_mode) + + // Use rfp because the frames look interpreted now + // Don't need the precise return PC here, just precise enough to point into this code blob. + address the_pc = __ pc(); + __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); + + __ mov(c_rarg0, rthread); + __ mov(c_rarg1, r7); // second arg: exec_mode + __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); + __ bl(rscratch1); + + // Set an oopmap for the call site + // Use the same PC we used for the last java frame + oop_maps->add_gc_map(the_pc - start, + new OopMap( frame_size_in_words, 0 )); + + // Clear fp AND pc + __ reset_last_Java_frame(true); + + // Collect return values + if(hasFPU()) { + __ vldr_f64(d0, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::fpu_state_off))); + } + __ ldrd(r0, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::r0_off))); + // I think this is useless (throwing pc?) + // __ ldr(r3, Address(sp, RegisterSaver::r3_offset_in_bytes())); + + // Pop self-frame. 
+ __ leave(); // Epilog + + // Jump to interpreter + __ b(lr); + + // Make sure all code is generated + masm->flush(); + + _deopt_blob = DeoptimizationBlob::create(&buffer, oop_maps, 0, exception_offset, reexecute_offset, frame_size_in_words); + _deopt_blob->set_unpack_with_exception_in_tls_offset(exception_in_tls_offset); + +} + +uint SharedRuntime::out_preserve_stack_slots() { + return 0; +} + +#if COMPILER2_OR_JVMCI +//------------------------------generate_uncommon_trap_blob-------------------- +void SharedRuntime::generate_uncommon_trap_blob() { + // Allocate space for the code + ResourceMark rm; + // Setup code generation tools + CodeBuffer buffer("uncommon_trap_blob", 2048, 1024); + MacroAssembler* masm = new MacroAssembler(&buffer); + + assert(SimpleRuntimeFrame::framesize % 4 == 0, "sp not 16-byte aligned"); + + address start = __ pc(); + + // Push self-frame. We get here with a return address in LR + // and sp should be 16 byte aligned + // push rfp and retaddr by hand + __ enter(); + // we don't expect an arg reg save area +#ifndef PRODUCT + assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); +#endif + // compiler left unloaded_class_index in j_rarg0 + __ mov(c_rarg1, j_rarg0); + + // we need to set the past SP to the stack pointer of the stub frame + // and the pc to the address where this runtime call will return + // although actually any pc in this code blob will do). + Label retaddr; + __ set_last_Java_frame(sp, noreg, retaddr, rscratch1); + + // Call C code. Need thread but NOT official VM entry + // crud. We cannot block on this call, no GC can happen. Call should + // capture callee-saved registers as well as return values. + // + // UnrollBlock* uncommon_trap(JavaThread* thread, jint unloaded_class_index); + // + // n.b. 2 gp args, 0 fp args, integral return type + + __ mov(c_rarg0, rthread); + __ mov(c_rarg2, (unsigned)Deoptimization::Unpack_uncommon_trap); + __ lea(rscratch1, + RuntimeAddress(CAST_FROM_FN_PTR(address, + Deoptimization::uncommon_trap))); + __ bl(rscratch1); + __ bind(retaddr); + + // Set an oopmap for the call site + OopMapSet* oop_maps = new OopMapSet(); + OopMap* map = new OopMap(SimpleRuntimeFrame::framesize, 0); + + // location of rfp is known implicitly by the frame sender code + + oop_maps->add_gc_map(__ pc() - start, map); + + __ reset_last_Java_frame(false); + + // move UnrollBlock* into r4 + __ mov(r4, r0); + + // Pop all the frames we must move/replace. + // + // Frame picture (youngest to oldest) + // 1: self-frame (no frame link) + // 2: deopting frame (no frame link) + // 3: caller of deopting frame (could be compiled/interpreted). + + // Pop self-frame + __ leave(); + + // Pop deoptimized frame (int) + __ ldr(r2, Address(r4, + Deoptimization::UnrollBlock:: + size_of_deoptimized_frame_offset_in_bytes())); + __ add(sp, sp, r2); + +#ifdef ASSERT + // Compilers generate code that bang the stack by as much as the + // interpreter would need. So this stack banging should never + // trigger a fault. Verify that it does not on non product builds. + if (UseStackBanging) { + // The compiled method that we are deoptimizing was popped from the stack. + // If the stack bang results in a stack overflow, we don't return to the + // method that is being deoptimized. The stack overflow exception is + // propagated to the caller of the deoptimized method. Need to get the pc + // from the caller in LR and restore FP. 
+ __ ldr(r2, Address(r4, + Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); + __ ldr(lr, Address(r2, 0)); + __ ldr(rfp, Address(r4, Deoptimization::UnrollBlock::initial_info_offset_in_bytes())); + __ ldr(r1, Address(r4, + Deoptimization::UnrollBlock:: + total_frame_sizes_offset_in_bytes())); + __ bang_stack_size(r1, r2); + } +#endif + // Now is the time to restore frameptr. Need to take what was in the frame header + // since it can be real FP if previous frame was interpreted/C1 or arbitrary value if C2 + __ ldr(rfp, Address(sp, -2*wordSize)/*Address(r4, + Deoptimization::UnrollBlock::initial_info_offset_in_bytes())*/); + + // Load address of array of frame pcs into r2 (address*) + __ ldr(r2, Address(r4, + Deoptimization::UnrollBlock::frame_pcs_offset_in_bytes())); + + // Load address of array of frame sizes into r5 (intptr_t*) + __ ldr(r5, Address(r4, + Deoptimization::UnrollBlock:: + frame_sizes_offset_in_bytes())); + + // Counter + __ ldr(r3, Address(r4, + Deoptimization::UnrollBlock:: + number_of_frames_offset_in_bytes())); // (int) + + // Now adjust the caller's stack to make up for the extra locals but + // record the original sp so that we can save it in the skeletal + // interpreter frame and the stack walking of interpreter_sender + // will get the unextended sp value and not the "real" sp value. + + const Register sender_sp = r7; + + __ mov(sender_sp, sp); + __ ldr(r1, Address(r4, + Deoptimization::UnrollBlock:: + caller_adjustment_offset_in_bytes())); // (int) + __ sub(sp, sp, r1); + + __ mov(rscratch1, 0); + // Push interpreter frames in a loop + Label loop; + __ bind(loop); + __ ldr(r1, __ post(r5, wordSize)); // Load frame size + __ sub(r1, r1, 2 * wordSize); // We'll push pc and rfp by hand + __ ldr(lr, __ post(r2, wordSize)); // Save return address + __ enter(); // and old rfp & set new rfp + __ sub(sp, sp, r1); // Prolog + __ str(sender_sp, Address(rfp, frame::get_interpreter_frame_sender_sp_offset() * wordSize)); // Make it walkable + // This value is corrected by layout_activation_impl + __ str(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); //zero it + __ mov(sender_sp, sp); // Pass sender_sp to next frame + __ subs(r3, r3, 1); // Decrement counter + __ b(loop, Assembler::GT); + __ ldr(lr, Address(r2, 0)); // save final return address + // Re-push self-frame + __ enter(); // & old rfp & set new rfp + + // Use rfp because the frames look interpreted now + // Save "the_pc" since it cannot easily be retrieved using the last_java_SP after we aligned SP. + // Don't need the precise return PC here, just precise enough to point into this code blob. + address the_pc = __ pc(); + __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); + + // Call C code. Need thread but NOT official VM entry + // crud. We cannot block on this call, no GC can happen. Call should + // restore return values to their stack-slots with the new SP. + // Thread is in rdi already. + // + // BasicType unpack_frames(JavaThread* thread, int exec_mode); + // + // n.b. 
2 gp args, 0 fp args, integral return type + + // sp should already be aligned + __ mov(c_rarg0, rthread); + __ mov(c_rarg1, (unsigned)Deoptimization::Unpack_uncommon_trap); + __ lea(rscratch1, RuntimeAddress(CAST_FROM_FN_PTR(address, Deoptimization::unpack_frames))); + __ bl(rscratch1); + + // Set an oopmap for the call site + // Use the same PC we used for the last java frame + oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); + + // Clear fp AND pc + __ reset_last_Java_frame(true); + + // Pop self-frame. + __ leave(); // Epilog + + // Jump to interpreter + __ b(lr); + + // Make sure all code is generated + masm->flush(); + + _uncommon_trap_blob = UncommonTrapBlob::create(&buffer, oop_maps, + SimpleRuntimeFrame::framesize >> 1); +} +#endif // COMPILER2_OR_JVMCI + + +//------------------------------generate_handler_blob------ +// +// Generate a special Compile2Runtime blob that saves all registers, +// and setup oopmap. +// +SafepointBlob* SharedRuntime::generate_handler_blob(address call_ptr, int poll_type) { + ResourceMark rm; + OopMapSet *oop_maps = new OopMapSet(); + OopMap* map; + + // Allocate space for the code. Setup code generation tools. + CodeBuffer buffer("handler_blob", 2048, 1024); + MacroAssembler* masm = new MacroAssembler(&buffer); + + address start = __ pc(); + address call_pc = NULL; + int frame_size_in_words; + bool cause_return = (poll_type == POLL_AT_RETURN); + + // If cause_return is true we are at a poll_return and there is + // the return address on the stack to the caller on the nmethod + // that is safepoint. We can leave this return on the stack and + // effectively complete the return and safepoint in the caller. + // Otherwise we push space for a return address that the safepoint + // handler will install later to make the stack walking sensible. + if (!cause_return) { + __ sub(sp, sp, wordSize); // make room for return address + } + map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words, !cause_return); + + // The following is basically a call_VM. However, we need the precise + // address of the call in order to generate an oopmap. Hence, we do all the + // work outselves. + + Label retaddr; + __ set_last_Java_frame(sp, noreg, retaddr, rscratch1); + + // The return address must always be correct so that frame constructor never + // sees an invalid pc. + + if (!cause_return) { + // overwrite the return address pushed by save_live_registers + // Additionally, r5 is a callee-saved register so we can look at + // it later to determine if someone changed the return address for + // us! + __ ldr(r5, Address(rthread, JavaThread::saved_exception_pc_offset())); + __ str(r5, Address(rfp, frame::get_return_addr_offset() * wordSize)); + } + + // Do the call + __ mov(c_rarg0, rthread); + __ lea(rscratch1, RuntimeAddress(call_ptr)); + __ bl(rscratch1); + __ bind(retaddr); + + // Set an oopmap for the call site. This oopmap will map all + // oop-registers and debug-info registers as callee-saved. This + // will allow deoptimization at this safepoint to find all possible + // debug-info recordings, as well as let GC find all oops. 
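
A little further below, an ASSERT block checks that the instruction being skipped over really is the safepoint poll, by masking off the base-register field and comparing against 0xe590c000. A quick standalone check of that arithmetic; 0xe59cc000 is assumed here to be the encoding of ldr r12, [r12, #0], which is what the comment in the code names:

#include <cstdint>
#include <cstdio>

// The ASSERT code below keeps every bit of the fetched instruction except the
// base-register field (bits 16..19) and compares against 0xe590c000, i.e.
// "ldr r12, [rN]" for any base register N.
int main() {
  const uint32_t poll   = 0xe59cc000u;          // assumed encoding of ldr r12, [r12, #0]
  const uint32_t masked = poll & 0xfff0ffffu;   // same effect as bic(x, x, ~0xfff0ffff)
  std::printf("masked = %#x, matches poll pattern: %d\n",
              masked, (int)(masked == 0xe590c000u));
  return 0;
}
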
+ + oop_maps->add_gc_map( __ pc() - start, map); + + Label noException, no_adjust, bail; + + __ reset_last_Java_frame(false); + + __ maybe_isb(); + __ membar(Assembler::LoadLoad | Assembler::LoadStore); + + if (SafepointMechanism::uses_thread_local_poll() && !cause_return) { + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbnz(rscratch1, no_adjust); + + // If our stashed return pc was modified by the runtime we avoid touching it + __ ldr(rscratch1, Address(rfp, frame::get_return_addr_offset() * wordSize)); + __ cmp(r5, rscratch1); + __ b(no_adjust, Assembler::NE); + +#ifdef ASSERT + // Verify the correct encoding of the poll we're about to skip. + // ldr(r12, [r12, #0]); + __ ldr(rscratch1, Address(r5)); + __ bic(rscratch1, rscratch1, ~0xfff0ffff); + __ mov(rscratch2, 0xe590c000); + __ cmp(rscratch1, rscratch2); + __ b(bail, Assembler::NE); +#endif + // Adjust return pc forward to step over the safepoint poll instruction + __ add(r5, r5, NativeInstruction::arm_insn_sz); + __ str(r5, Address(rfp, frame::get_return_addr_offset() * wordSize)); + } + + __ bind(no_adjust); + + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cmp(rscratch1, 0); + + // does not kill flags + RegisterSaver::restore_live_registers(masm, cause_return); + // for !POLL_AT_RETURN the stack has return address on it + + __ b(noException, Assembler::EQ); + + // Exception pending + if (cause_return) + __ mov(r3, lr); + else + __ pop(r3); + __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); + + // No exception case + __ bind(noException); + + if (cause_return) + __ b(lr); + else + __ pop(r15_pc); + +#ifdef ASSERT + __ bind(bail); + __ stop("Attempting to adjust pc to skip safepoint poll but the return point is not what we expected"); +#endif + + // Make sure all code is generated + masm->flush(); + + // Fill-out other meta info + return SafepointBlob::create(&buffer, oop_maps, frame_size_in_words); +} + +// +// generate_resolve_blob - call resolution (static/virtual/opt-virtual/ic-miss +// +// Generate a stub that calls into vm to find out the proper destination +// of a java call. All the argument registers are live at this point +// but since this is generic code we don't know what they are and the caller +// must do any gc of the args. +// +RuntimeStub* SharedRuntime::generate_resolve_blob(address destination, const char* name) { + assert (StubRoutines::forward_exception_entry() != NULL, "must be generated before"); + + // allocate space for the code + ResourceMark rm; + + //CodeBuffer buffer(name, 1000, 512); + CodeBuffer buffer(name, 2048, 512 ); // changed as error later + MacroAssembler* masm = new MacroAssembler(&buffer); + + int frame_size_in_words; + + OopMapSet *oop_maps = new OopMapSet(); + OopMap* map = NULL; + + int start = __ offset(); + + map = RegisterSaver::save_live_registers(masm, 0, &frame_size_in_words); + + int frame_complete = __ offset(); + + { + Label retaddr; + __ set_last_Java_frame(sp, noreg, retaddr, rscratch1); + + __ mov(c_rarg0, rthread); + __ lea(rscratch1, RuntimeAddress(destination)); + + __ bl(rscratch1); + __ bind(retaddr); + } + + // Set an oopmap for the call site. + // We need this not only for callee-saved registers, but also for volatile + // registers that the compiler might be keeping live across a safepoint. 
+ + oop_maps->add_gc_map( __ offset() - start, map); + + __ maybe_isb(); + + // r0 contains the address we are going to jump to assuming no exception got installed + + // clear last_Java_sp + __ reset_last_Java_frame(false); + // check for pending exceptions + Label pending; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbnz(rscratch1, pending); + + // get the returned Method* + __ get_vm_result_2(rmethod, rthread); + __ str(rmethod, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::rmethod_off))); + + // r0 is where we want to jump, overwrite rscratch1 which is saved and scratch + __ str(r0, Address(sp, RegisterSaver::offset_in_bytes(RegisterSaver::rscratch1_off))); + RegisterSaver::restore_live_registers(masm); + + // We are back the the original state on entry and ready to go. + + __ b(rscratch1); + + // Pending exception after the safepoint + + __ bind(pending); + + RegisterSaver::restore_live_registers(masm); + + // exception pending => remove activation and forward to exception handler + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, JavaThread::vm_result_offset())); + + __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); + __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); + + // ------------- + // make sure all code is generated + masm->flush(); + + // return the blob + // frame_size_words or bytes?? + return RuntimeStub::new_runtime_stub(name, &buffer, frame_complete, frame_size_in_words, oop_maps, true); +} + + +#if COMPILER2_OR_JVMCI +// This is here instead of runtime_x86_64.cpp because it uses SimpleRuntimeFrame +// +//------------------------------generate_exception_blob--------------------------- +// creates exception blob at the end +// Using exception blob, this code is jumped from a compiled method. +// (see emit_exception_handler in x86_64.ad file) +// +// Given an exception pc at a call we call into the runtime for the +// handler in this method. This handler might merely restore state +// (i.e. callee save registers) unwind the frame and jump to the +// exception handler for the nmethod if there is no Java level handler +// for the nmethod. +// +// This code is entered with a jmp. +// +// Arguments: +// r0: exception oop +// r3: exception pc +// +// Results: +// r0: exception oop +// r3: exception pc in caller or ??? +// destination: exception handler of caller +// +// Note: the exception pc MUST be at a call (precise debug information) +// Registers r0, r3, r2, r4, r5, r8-r11 are not callee saved. +// + +void OptoRuntime::generate_exception_blob() { + // allocate space for code + ResourceMark rm; + int pad = VerifyThread ? 256 : 0;// Extra slop space for more verify code + + // setup code generation tools + // Measured 8/7/03 at 256 in 32bit debug build (no VerifyThread) + // Measured 8/7/03 at 528 in 32bit debug build (VerifyThread) + CodeBuffer buffer("exception_blob", 600+pad, 512); + MacroAssembler* masm = new MacroAssembler(&buffer); + + int framesize_in_words = 2; // FP + LR + int framesize_in_bytes = framesize_in_words * wordSize; + int framesize_in_slots = framesize_in_bytes / sizeof(jint); + + address start = __ pc(); + + __ str(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ str(r3, Address(rthread, JavaThread::exception_pc_offset())); + + // This call does all the hard work. It checks if an exception catch + // exists in the method. + // If so, it returns the handler address. 
+ // If the nmethod has been deoptimized and it had a handler the handler + // address is the deopt blob unpack_with_exception entry. + // + // If no handler exists it prepares for stack-unwinding, restoring the callee-save + // registers of the frame being removed. + // + __ mov(lr, r3); + __ enter(); + address the_pc = __ pc(); + __ set_last_Java_frame(sp, noreg, the_pc, rscratch1); + + __ mov(r0, rthread); + + // This call can block at exit and nmethod can be deoptimized at that + // point. If the nmethod had a catch point we would jump to the + // now deoptimized catch point and fall thru the vanilla deopt + // path and lose the exception + // Sure would be simpler if this call didn't block! + __ call(CAST_FROM_FN_PTR(address, OptoRuntime::handle_exception_C)); + + // Set an oopmap for the call site. This oopmap will only be used if we + // are unwinding the stack. Hence, all locations will be dead. + // Callee-saved registers will be the same as the frame above (i.e., + // handle_exception_stub), since they were restored when we got the + // exception. + + OopMapSet* oop_maps = new OopMapSet(); + + oop_maps->add_gc_map(the_pc - start, new OopMap(SimpleRuntimeFrame::framesize, 0)); + + __ reset_last_Java_frame(false); + + __ leave(); + + // Restore SP from its saved reg (FP) if the exception PC is a MethodHandle call site. + __ ldr(rscratch1, Address(rthread, JavaThread::is_method_handle_return_offset())); + __ cmp(rscratch1, 0); + __ mov(sp, rfp, Assembler::NE); + + // We have a handler in r0 (could be deopt blob). + __ mov(rscratch2, r0); + + // Since this may be the deopt blob we must set R3 to look like we returned + // from the original pc that threw the exception + + __ ldr(r3, Address(rthread, JavaThread::exception_pc_offset())); + + __ ldr(r0, Address(rthread, JavaThread::exception_oop_offset())); + __ mov(rscratch1, 0); +#ifdef ASSERT + __ str(rscratch1, Address(rthread, JavaThread::exception_handler_pc_offset())); + __ str(rscratch1, Address(rthread, JavaThread::exception_pc_offset())); +#endif + // Clear the exception oop so GC no longer processes it as a root. + __ str(rscratch1, Address(rthread, JavaThread::exception_oop_offset())); + __ b(rscratch2); + + // ------------- + // make sure all code is generated + masm->flush(); + + _exception_blob = ExceptionBlob::create(&buffer, oop_maps, framesize_in_words); +} +#endif // COMPILER2_OR_JVMCI --- /dev/null 2018-09-25 19:25:25.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/stubGenerator_aarch32.cpp 2018-09-25 19:25:25.000000000 +0300 @@ -0,0 +1,2904 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "gc/shared/barrierSet.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "interpreter/interpreter.hpp" +#include "nativeInst_aarch32.hpp" +#include "oops/instanceOop.hpp" +#include "oops/method.hpp" +#include "oops/objArrayKlass.hpp" +#include "oops/oop.inline.hpp" +#include "prims/methodHandles.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/handles.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubCodeGenerator.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/thread.inline.hpp" +#include "vm_version_aarch32.hpp" +#ifdef COMPILER2 +#include "opto/runtime.hpp" +#endif + + +// Declaration and definition of StubGenerator (no .hpp file). +// For a more detailed description of the stub routine structure +// see the comment in stubRoutines.hpp + +#undef __ +#define __ _masm-> +#define TIMES_OOP lsl(exact_log2(4)) + +#ifdef PRODUCT +#define BLOCK_COMMENT(str) /* nothing */ +#else +#define BLOCK_COMMENT(str) __ block_comment(str) +#endif + +#define BIND(label) bind(label); BLOCK_COMMENT(#label ":") + +// Stub Code definitions + +class StubGenerator: public StubCodeGenerator { + private: + +#ifdef PRODUCT +#define inc_counter_np(counter) ((void)0) +#else + void inc_counter_np_(int& counter) { + __ lea(rscratch2, ExternalAddress((address)&counter)); + __ ldr(rscratch1, Address(rscratch2)); + __ add(rscratch1, rscratch1, 1); + __ str(rscratch1, Address(rscratch2)); + } +#define inc_counter_np(counter) \ + BLOCK_COMMENT("inc_counter " #counter); \ + inc_counter_np_(counter); +#endif + + // Call stubs are used to call Java from C + // + // There are only four registers available to house arguments and we're expecting eight + // the layout will be as follows: + + // c_rarg0 = call wrapper address + // c_rarg1 = result + // c_rarg2 = result type + // c_rarg3 = method + // sp -> [ entry_point + // parameters -> java params + // parameter size (in words) + // thread] (address increasing) + // + // We don't + // NEW!! layout for aarch32 so that save and restore can be collapsed into a single + // load/store + // layout of saved registers now is + // 0 [ saved lr ] <- rfp + // -1 [ saved fp ] + // -2 [ r12/rthread ] Thread passed in args + // -3 [ r10/rmethod ] NOTE omitted rfp as restored automatically + // -4 [ r9/rscratch1 ] Platform register? + // -5 [ r8/thread ] + // -6 [ r7/rcpool ] + // -7 [ r6/rlocals ] + // -8 [ r5/rbcp ] + // -9 [ r4/rdispatch ] + // -10 [ r2/res type ] + // -11 [ r1/result ] + // -12 [r0/call wrapper]<- sp (when restored from fp value) + // -13 maybe alignment + // -YY [ java arg0 ] + // ... + // -xx [ java argn ] <- sp on branch into java + // + // XXX Note we do not save floating point registers + // Only floating point registers s16-31 / d8-15 need to be saved + // these are never touched by template interpreted code. + // On a sequence such as C -> Java -> C, the C functions will save them if used. 
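
generate_call_stub below saves the registers of this layout with a single stmdb over a bit mask and derives its frame offsets from the number of bits set in that mask. A small standalone check of that bookkeeping, as a sketch only: the mask literal is copied from the stub below, while the 16-bit width and wordSize of 4 are aarch32 assumptions.

#include <bitset>
#include <cstdio>

// Sanity check of the save-area bookkeeping used by generate_call_stub below:
// the number of registers pushed by stmdb is the population count of the mask.
int main() {
  const unsigned c_save_regset = 0b0001011111110111;     // r0-r2, r4-r10, r12
  const int wordSize = 4;                                // aarch32 assumption
  const int nsaved = (int)std::bitset<16>(c_save_regset).count();  // role of __ count_bits()
  std::printf("registers saved: %d (%d bytes)\n", nsaved, nsaved * wordSize);
  return 0;
}
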
+ + address generate_call_stub(address& return_address) { + /*assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && + (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, + "adjust this code");*/ + const int thread_off = -frame::get_frame_size(VMFrameAPCS) * wordSize; + + StubCodeMark mark(this, "StubRoutines", "call_stub"); + address start = __ pc(); + __ reg_printf("entering call stub with { sp : %p, rfp : %p, lr : %p}\n", sp, rfp, lr); + __ enter(VMFrameAPCS); //save rfp & lr and possibly another 2 words + + const int entry_point_arg_off = 1 * wordSize, + params_arg_off = 2 * wordSize, + param_sz_arg_off = 3 * wordSize, + thread_arg_off = 4 * wordSize; + // r12 is a scratch register so we can clobber it to save thread + // which is needed at the end + __ ldr(r12, Address(rfp, thread_arg_off)); + // r0, r1, r2, r4 - r10, r12 + // we save r0 as the call_wrapper_address is needed elsewhere + // we save r1, r2 as they hold the result and it's type, + // which are needed on return + // r12 holds the thread ptr + unsigned c_save_regset = 0b0001011111110111; + int nsaved = __ count_bits(c_save_regset); + __ stmdb(sp, c_save_regset); + + // Offset from rfp to end of stack. + const int rfp_tos_offset_bytes = frame::get_offset_from_rfp_bytes() + nsaved * wordSize; + + // install Java thread in global register now we have saved + // whatever value it held + __ mov(rthread, r12); + // And method + __ mov(rmethod, c_rarg3); + +#ifdef ASSERT + // make sure we have no pending exceptions + { + Label L; + __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); + __ cmp(rscratch1, (unsigned)NULL_WORD); + __ b(L, Assembler::EQ); + __ stop("StubRoutines::call_stub: entered with pending exception"); + __ BIND(L); + } +#endif + __ ldr(rscratch2, Address(rfp, param_sz_arg_off)); + // align sp at the time we call java + __ sub(sp, sp, rscratch2, lsl(LogBytesPerWord)); + __ align_stack(); + __ add(sp, sp, rscratch2, lsl(LogBytesPerWord)); + + __ ldr(rscratch1, Address(rfp, params_arg_off)); + + BLOCK_COMMENT("pass parameters if any"); + Label parameters_done; + + __ reg_printf("call_stub param_off = %p, param_sz = %d\n", rscratch1, rscratch2); + __ cmp(rscratch2, 0); + __ b(parameters_done, Assembler::EQ); + + // r14 makes ok temp as already saved in frame header + address loop = __ pc(); + __ ldr(r14, Address(__ post(rscratch1, wordSize))); + __ subs(rscratch2, rscratch2, 1); + + // TODO remove + __ reg_printf("\tARG SP[%d] : 0x%08x\n", rscratch2, r14); + __ cmp(rscratch2, 0); + // END TODO + __ push(r14); + __ b(loop, Assembler::GT); + + __ BIND(parameters_done); + +#ifdef ASSERT + __ verify_stack_alignment(); +#endif + + BLOCK_COMMENT("call Java function"); + __ ldr(rscratch1, Address(rfp, entry_point_arg_off)); + __ reg_printf("Calling Java function with rfp = %p, sp = %p\n", rfp, sp); + __ mov(r4, sp); // set sender sp + __ bl(rscratch1); + // save current address for use by exception handling code + return_address = __ pc(); + + __ reg_printf("Returned to call_stub with rfp = %p, sp = %p\n", rfp, sp); + + // At this point rfp should be restored to the value it was set to before + // use it to set the top of stack. 
+ __ sub(sp, rfp, rfp_tos_offset_bytes); + +#ifdef ASSERT + // verify that threads correspond + __ ldr(r12, Address(rfp, thread_off)); + //rfp points to register stored in highest memory location - first on + // stack, that's the saved lr, r12 is just below that + // stored in r12 at this point + { + Label L, S; + __ cmp(rthread, r12); + __ b(S, Assembler::NE); + __ get_thread(r12); + __ cmp(rthread, r12); + __ b(L, Assembler::EQ); + __ BIND(S); + __ stop("StubRoutines::call_stub: threads must correspond"); + __ BIND(L); + } +#endif + + if(MacroAssembler::enable_debugging_static) { + // FIXME Remove this hacky debugging code + Label L; + __ ldr(rscratch2, Address(rthread, Thread::pending_exception_offset())); + __ cbnz(rscratch2, L); + // If we're returning via an exception then we shouldn't report exit, + // the exception handler will have already reported the exit and reporting + // via our progress through the call stub will result in an extra method + // being reported as exited. + __ print_method_exit(); + __ bind(L); + } + + // NOTE Horrible tricks here + // We need to preserve current r0 and r1 values as they contain the return value. + // First we discard r0 saved to stack, no longer needed. + // We have saved result and type as c_rarg1 and c_rarg2, so now we alter + // the regset to load as follows: + // c_rarg2 = result + // c_rarg3 = result_type + + assert((c_save_regset & 0xf) == 0b0111, "change me"); + __ add(sp, sp, wordSize); + const int altered_saved_regset = (~0xf & c_save_regset) | 0xc; + __ ldmia(sp, altered_saved_regset); + + // store result depending on type (everything that is not + // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) + // n.b. this assumes Java returns an integral result in r0 + // and a floating result in j_farg0 + + Label is_object, is_long, is_float, is_double, exit; + __ cmp(c_rarg3, T_OBJECT); + __ b(is_object, Assembler::EQ); + __ cmp(c_rarg3, T_LONG); + __ b(is_long, Assembler::EQ); + if(hasFPU()) { + // soft FP fall through T_INT case + __ cmp(c_rarg3, T_FLOAT); + __ b(is_float, Assembler::EQ); + } + __ cmp(c_rarg3, T_DOUBLE); + if(hasFPU()) { + __ b(is_double, Assembler::EQ); + } else { + __ b(is_long, Assembler::EQ); + } + + // handle T_INT case + __ str(r0, Address(c_rarg2)); + + __ BIND(exit); + __ leave(VMFrameAPCS); //Restore rfp, sp, lr + __ reg_printf("leaving call stub with { sp : %p, rfp : %p, lr : %p}\n", sp, rfp, lr); + // Pop arguments from stack. + //__ add(sp, sp, 4 * wordSize); + + __ b(lr); + + // handle return types different from T_INT + __ BIND(is_object); + __ mov(r1, 0); + + __ BIND(is_long); + __ strd(r0, r1, Address(c_rarg2, 0)); + __ b(exit, Assembler::AL); + + if(hasFPU()) { + __ BIND(is_float); + __ vstr_f32(f0, Address(c_rarg2, 0)); + __ b(exit, Assembler::AL); + + __ BIND(is_double); + __ vstr_f64(d0, Address(c_rarg2, 0)); + __ b(exit, Assembler::AL); + } + return start; + } + + // Return point for a Java call if there's an exception thrown in + // Java code. The exception is caught and transformed into a + // pending exception stored in JavaThread that can be tested from + // within the VM. + // + // Note: Usually the parameters are removed by the callee. In case + // of an exception crossing an activation frame boundary, that is + // not the case if the callee is compiled code => need to setup the + // rsp. 
+ // + // r0: exception oop + + // NOTE: this is used as a target from the signal handler so it + // needs an x86 prolog which returns into the current simulator + // executing the generated catch_exception code. so the prolog + // needs to install rax in a sim register and adjust the sim's + // restart pc to enter the generated code at the start position + // then return from native to simulated execution. + + address generate_catch_exception() { + const int thread_off = -frame::get_frame_size(VMFrameAPCS) * wordSize; + + StubCodeMark mark(this, "StubRoutines", "catch_exception"); + address start = __ pc(); + + // same as in generate_call_stub(): + const Address thread(rfp, thread_off); + +#ifdef ASSERT + // verify that threads correspond + { + Label L, S; + __ ldr(rscratch1, thread); + __ cmp(rthread, rscratch1); + __ b(S, Assembler::NE); + __ get_thread(rscratch1); + __ cmp(rthread, rscratch1); + __ b(L, Assembler::EQ); + __ bind(S); + __ stop("StubRoutines::catch_exception: threads must correspond"); + __ bind(L); + } +#endif + + // set pending exception + __ verify_oop(r0); + + __ str(r0, Address(rthread, Thread::pending_exception_offset())); + __ mov(rscratch1, (address)__FILE__); + __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); + __ mov(rscratch1, (int)__LINE__); + __ str(rscratch1, Address(rthread, Thread::exception_line_offset())); + + // complete return to VM + assert(StubRoutines::_call_stub_return_address != NULL, + "_call_stub_return_address must have been generated before"); + __ b(StubRoutines::_call_stub_return_address); + + return start; + } + + // Continuation point for runtime calls returning with a pending + // exception. The pending exception check happened in the runtime + // or native call stub. The pending exception in Thread is + // converted into a Java-level exception. + // + // Contract with Java-level exception handlers: + // r0: exception + // r3: throwing pc + // + // NOTE: At entry of this stub, exception-pc must be in LR !! + + // NOTE: this is always used as a jump target within generated code + // so it just needs to be generated code wiht no x86 prolog + + address generate_forward_exception() { + //FIXME NOTE ON ALTERATION TO ARM32 IT WAS ASSUMED THAT rmethod + // won't be used anymore and set on entry to the handler - is this true? + + Register spare = rmethod; + + StubCodeMark mark(this, "StubRoutines", "forward exception"); + address start = __ pc(); + + // Upon entry, LR points to the return address returning into + // Java (interpreted or compiled) code; i.e., the return address + // becomes the throwing pc. + // + // Arguments pushed before the runtime call are still on the stack + // but the exception handler will reset the stack pointer -> + // ignore them. A potential result in registers can be ignored as + // well. + +#ifdef ASSERT + // make sure this code is only executed if there is a pending exception + { + Label L; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbnz(rscratch1, L); + __ stop("StubRoutines::forward exception: no pending exception (1)"); + __ bind(L); + } +#endif + + // compute exception handler into r2 + + // call the VM to find the handler address associated with the + // caller address. pass thread in r0 and caller pc (ret address) + // in r1. n.b. the caller pc is in lr, unlike x86 where it is on + // the stack. 
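
The stub that follows implements the contract sketched in these comments: the return address in lr becomes the throwing pc, a handler is looked up for that pc, and control continues at the handler with the exception in r0 and the throwing pc in r3. A toy model of that control flow, with invented function and type names:

#include <cstdio>

// Toy model of the forward-exception contract: the return address (lr) becomes
// the throwing pc, a handler is looked up for that pc, and control continues at
// the handler with the exception and the throwing pc in hand. Names invented.
typedef void (*handler_t)(void* exception, const void* throwing_pc);

static void toy_handler(void* ex, const void* pc) {
  std::printf("handling exception %p thrown at pc %p\n", ex, pc);
}

static handler_t handler_for_return_address(const void* /*ret_pc*/) {
  return toy_handler;          // the real stub asks SharedRuntime for this address
}

int main() {
  int exception = 0;
  int call_site = 0;
  const void* throwing_pc = &call_site;        // stands in for lr at stub entry
  handler_t h = handler_for_return_address(throwing_pc);
  h(&exception, throwing_pc);                  // r0 = exception, r3 = throwing pc
  return 0;
}
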
+ __ mov(c_rarg1, lr); + // lr will be trashed by the VM call so we move it to R2 + // (callee-saved) because we also need to pass it to the handler + // returned by this call. + __ mov(spare, lr); //note rscratch1 is a callee saved register + BLOCK_COMMENT("call exception_handler_for_return_address"); + __ call_VM_leaf(CAST_FROM_FN_PTR(address, + SharedRuntime::exception_handler_for_return_address), + rthread, c_rarg1); + // we should not really care that lr is no longer the callee + // address. we saved the value the handler needs in spare so we can + // just copy it to r3. however, the C2 handler will push its own + // frame and then calls into the VM and the VM code asserts that + // the PC for the frame above the handler belongs to a compiled + // Java method. So, we restore lr here to satisfy that assert. + __ mov(lr, spare); + // setup r0 & r3 & clear pending exception + __ mov(r3, spare); + __ mov(spare, r0); + __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rthread, Thread::pending_exception_offset())); + +#ifdef ASSERT + // make sure exception is set + { + Label L; + __ cbnz(r0, L); + __ stop("StubRoutines::forward exception: no pending exception (2)"); + __ bind(L); + } +#endif + // continue at exception handler + // r0: exception + // r3: throwing pc + // spare: exception handler + + __ verify_oop(r0); + __ b(spare); + + return start; + } + + // Non-destructive plausibility checks for oops + // + // Arguments: + // r0: oop to verify + // rscratch1: error message + // + // Stack after saving c_rarg3: + // [tos + 0]: saved c_rarg3 + // [tos + 1]: saved c_rarg2 + // [tos + 2]: saved lr + // [tos + 3]: saved rscratch2 + // [tos + 4]: saved r1 + // [tos + 5]: saved r0 + // [tos + 6]: saved rscratch1 + address generate_verify_oop() { + StubCodeMark mark(this, "StubRoutines", "verify_oop"); + address start = __ pc(); + + Label exit, error; + + // save c_rarg2 and c_rarg3 + __ stmdb(sp, RegSet::of(c_rarg2, c_rarg3).bits()); + + __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); + __ ldr(c_rarg3, Address(c_rarg2)); + __ add(c_rarg3, c_rarg3, 1); + __ str(c_rarg3, Address(c_rarg2)); + + // object is in r0 + // make sure object is 'reasonable' + __ cbz(r0, exit); // if obj is NULL it is OK + + // Check if the oop is in the right area of memory + __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); + __ andr(c_rarg2, r0, c_rarg3); + __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); + + // Compare c_rarg2 and c_rarg3. We don't use a compare + // instruction here because the flags register is live. + __ eor(c_rarg2, c_rarg2, c_rarg3); + __ cbnz(c_rarg2, error); + + // make sure klass is 'reasonable', which is not zero. 
+ __ load_klass(r0, r0); // get klass + __ cbz(r0, error); // if klass is NULL it is broken + + // return if everything seems ok + __ bind(exit); + + __ ldmia(sp, RegSet::of(c_rarg2, c_rarg3).bits()); + __ b(lr); + + // handle errors + __ bind(error); + __ ldmia(sp, RegSet::of(c_rarg2, c_rarg3).bits()); + + __ pusha(); + // Save old sp + __ add(c_rarg2, sp, 14 * wordSize); + __ str(c_rarg2, Address( __ pre(sp, -wordSize))); + __ mov(c_rarg0, rscratch1); // pass address of error message + __ mov(c_rarg1, lr); // pass return address + __ mov(c_rarg2, sp); // pass address of regs on stack +#ifndef PRODUCT + assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); +#endif + BLOCK_COMMENT("call MacroAssembler::debug"); + __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug32)); + __ bl(rscratch1); + __ hlt(0); + + return start; + } + + // NOTE : very strange, I changed this but I don't know why the Address:(signed extend word) was here + //void array_overlap_test(Label& L_no_overlap, Address sf) { __ b(L_no_overlap); } + void array_overlap_test(Label& L_no_overlap) { __ b(L_no_overlap); } + //no test being performed ? + + // + // Small copy: less than 4 bytes. + // + // NB: Ignores all of the bits of count which represent more than 3 + // bytes, so a caller doesn't have to mask them. + + void copy_memory_small(Register s, Register d, Register count, Register tmp, bool is_aligned, int step) { + const int granularity = uabs(step); + const bool gen_always = !is_aligned || (-4 < step && step < 0); + Label halfword, done; + + if ((granularity <= 1) || gen_always) { + __ tst(count, 1); + __ b(halfword, Assembler::EQ); + __ ldrb(tmp, step < 0 ? __ pre(s, -1) : __ post(s, 1)); + __ strb(tmp, step < 0 ? __ pre(d, -1) : __ post(d, 1)); + } + + if ((granularity <= 2) || gen_always) { + __ bind(halfword); + __ tst(count, 2); + __ b(done, Assembler::EQ); + __ ldrh(tmp, step < 0 ? __ pre(s, -2) : __ post(s, 2)); + __ strh(tmp, step < 0 ? __ pre(d, -2) : __ post(d, 2)); + } + + __ bind(done); + } + + void copy_memory_simd(Register s, Register d, + Register count, Register tmp, int step, + DoubleFloatRegSet tmp_set, size_t tmp_set_size ) { + assert(UseSIMDForMemoryOps, "should be available"); + Label simd_loop, simd_small; + + __ cmp(count, tmp_set_size); + __ b(simd_small, Assembler::LT); + + __ mov(tmp, count, __ lsr(exact_log2(tmp_set_size))); + __ sub(count, count, tmp, __ lsl(exact_log2(tmp_set_size))); + + __ bind(simd_loop); + + __ pld(Address(s, step < 0 ? -2 * tmp_set_size : tmp_set_size)); + + if (step < 0) { + __ vldmdb_f64(s, tmp_set.bits()); + __ vstmdb_f64(d, tmp_set.bits()); + } else { + __ vldmia_f64(s, tmp_set.bits()); + __ vstmia_f64(d, tmp_set.bits()); + } + + __ subs(tmp, tmp, 1); + __ b(simd_loop, Assembler::NE); + + __ bind(simd_small); + } + + // All-singing all-dancing memory copy. + // + // Copy count units of memory from s to d. The size of a unit is + // step, which can be positive or negative depending on the direction + // of copy. If is_aligned is false, we align the source address. 
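+  //
+  // A rough sketch of the strategy below (illustrative only, not the exact
+  // generated code; see the implementation for the precise conditions):
+  //
+  //   count *= granularity;                     // elements -> bytes
+  //   if (count >= 32) {
+  //     if needed, align s to a 4-byte boundary (copy_memory_small);
+  //     if (d is 4-byte aligned)
+  //       bulk-copy 16-byte LDM/STM blocks (or 64-/16-byte NEON blocks when
+  //       UseSIMDForMemoryOps is set and NEON is available);
+  //   }
+  //   copy any remaining whole words one at a time;
+  //   copy_memory_small() finishes the last 0-3 bytes.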
+ // + + void copy_memory(bool is_aligned, Register s, Register d, + Register count, int step) { + const int small_copy_size = 32; // 1 copy by ldm pays off alignment efforts and push/pop of temp set + const int granularity = uabs(step); + const Register tmp2 = rscratch2; + const Register t0 = r3; + Label small; + + assert_different_registers(s, d, count, tmp2, t0); + + __ mov(count, count, __ lsl(exact_log2(granularity))); + + if (step < 0) { + __ add(s, s, count); + __ add(d, d, count); + } + + __ cmp(count, small_copy_size); + __ b(small, Assembler::LT); + + // aligning + if (!is_aligned || (-4 < step && step < 0)) { + assert(3 <= small_copy_size, "may copy number of bytes required for alignment"); + if (step < 0) { + __ andr(tmp2, s, 3); + } else { + __ rsb(tmp2, s, 0); + __ andr(tmp2, tmp2, 3); + } + __ sub(count, count, tmp2); + copy_memory_small(s, d, tmp2, t0, is_aligned, step); + } + +#ifdef ASSERT + Label src_aligned; + __ tst(s, 3); + __ b(src_aligned, Assembler::EQ); + __ stop("src is not aligned"); + __ bind(src_aligned); +#endif + + // if destination is unaliged, copying by words is the only option + __ tst(d, 3); + __ b(small, Assembler::NE); + if (UseSIMDForMemoryOps && (VM_Version::features() & FT_AdvSIMD)) { + copy_memory_simd(s, d, count, tmp2, step, DoubleFloatRegSet::range(d0, d7), 64); + copy_memory_simd(s, d, count, tmp2, step, DoubleFloatRegSet::range(d0, d1), 16); + } else { + const RegSet tmp_set = RegSet::range(r4, r7); + const int tmp_set_size = 16; + Label ldm_loop; + + assert_different_registers(s, d, count, tmp2, r4, r5, r6, r7); + + __ cmp(count, tmp_set_size); + __ b(small, Assembler::LT); + + __ push(tmp_set, sp); + + __ mov(tmp2, count, __ lsr(exact_log2(tmp_set_size))); + __ sub(count, count, tmp2, __ lsl(exact_log2(tmp_set_size))); + + __ bind(ldm_loop); + + __ pld(Address(s, step < 0 ? -2 * tmp_set_size : tmp_set_size)); + + if (step < 0) { + __ ldmdb(s, tmp_set.bits()); + __ stmdb(d, tmp_set.bits()); + } else { + __ ldmia(s, tmp_set.bits()); + __ stmia(d, tmp_set.bits()); + } + + __ subs(tmp2, tmp2, 1); + __ b(ldm_loop, Assembler::NE); + + __ pop(tmp_set, sp); + } + + __ bind(small); + + Label words_loop, words_done; + __ cmp(count, BytesPerWord); + __ b(words_done, Assembler::LT); + + __ mov(tmp2, count, __ lsr(exact_log2(BytesPerWord))); + __ sub(count, count, tmp2, __ lsl(exact_log2(BytesPerWord))); + + __ bind(words_loop); + + Address src = step < 0 ? __ pre(s, -BytesPerWord) : __ post(s, BytesPerWord); + Address dst = step < 0 ? __ pre(d, -BytesPerWord) : __ post(d, BytesPerWord); + + __ pld(Address(s, step < 0 ? -2 * BytesPerWord : BytesPerWord)); + __ ldr(t0, src); + __ str(t0, dst); + __ subs(tmp2, tmp2, 1); + + __ b(words_loop, Assembler::NE); + + __ bind(words_done); + copy_memory_small(s, d, count, t0, is_aligned, step); + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 4-byte boundary + // ignored + // is_oop - true => oop array, so generate store check code + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let + // the hardware handle it. The two dwords within qwords that span + // cache line boundaries will still be loaded and stored atomicly. + // + // Side Effects: + // disjoint_int_copy_entry is set to the no-overlap entry point + // used by generate_conjoint_int_oop_copy(). 
+ // + address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, + const char *name, bool dest_uninitialized = false) { + Register s = c_rarg0, d = c_rarg1, count = c_rarg2; + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + if (entry != NULL) { + *entry = __ pc(); + // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) + BLOCK_COMMENT("Entry:"); + } + __ enter(VMFrameAPCS); + + DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + if (aligned) { + decorators |= ARRAYCOPY_ALIGNED; + } + + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, is_oop, d, count); + + if (is_oop) { + __ push(RegSet::of(d, count), sp); + } + + // copy memory likes to voluntary use rscratch2 and r3 + copy_memory(aligned, s, d, count, size); + + if (is_oop) { + __ pop(RegSet::of(d, count), sp); + __ sub(count, count, 1); // make an inclusive end pointer + __ lea(count, Address(d, count, lsl(exact_log2(size)))); + } + + // barriers are for oop arrays only, so don't worry about s, d and count being lost before + bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch2); + + __ leave(VMFrameAPCS); + __ b(lr); + return start; + } + + // Arguments: + // aligned - true => Input and output aligned on a HeapWord == 4-byte boundary + // ignored + // is_oop - true => oop array, so generate store check code + // name - stub name string + // + // Inputs: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // + // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let + // the hardware handle it. The two dwords within qwords that span + // cache line boundaries will still be loaded and stored atomicly. + // + address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, + address *entry, const char *name, + bool dest_uninitialized = false) { + Register s = c_rarg0, d = c_rarg1, count = c_rarg2; + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + __ cmp(d, s); + __ b(nooverlap_target, Assembler::LS); + + __ enter(VMFrameAPCS); + + DecoratorSet decorators = IN_HEAP | IS_ARRAY; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + if (aligned) { + decorators |= ARRAYCOPY_ALIGNED; + } + + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, is_oop, d, count); + + if (is_oop) { + __ push(RegSet::of(d, count), sp); + } + + // copy memory likes to voluntary use rscratch2 and r3 + copy_memory(aligned, s, d, count, -size); + + if (is_oop) { + __ pop(RegSet::of(d, count), sp); + __ sub(count, count, 1); // make an inclusive end pointer + __ lea(count, Address(d, count, lsl(exact_log2(size)))); + } + + // barriers are for oop arrays only, so don't worry about s, d and count being lost before + bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch2); + + __ leave(VMFrameAPCS); + __ b(lr); + return start; + } + + // Helper for generating a dynamic type check. + // Smashes rscratch1. 
+ void generate_type_check(Register sub_klass, + Register super_check_offset, + Register super_klass, + Label& L_success) { + assert_different_registers(sub_klass, super_check_offset, super_klass); + + BLOCK_COMMENT("type_check:"); + + Label L_miss; + + __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, + super_check_offset); + __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); + + // Fall through on failure! + __ BIND(L_miss); + } + + // + // Generate checkcasting array copy stub + // + // Input: + // c_rarg0 - source array address + // c_rarg1 - destination array address + // c_rarg2 - element count, treated as ssize_t, can be zero + // c_rarg3 - size_t ckoff (super_check_offset) + // [sp] - oop ckval (super_klass) + // + // Output: + // r0 == 0 - success + // r0 == -1^K - failure, where K is partial transfer count + // + address generate_checkcast_copy(const char *name, address *entry, + bool dest_uninitialized = false) { + Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; + + // Input registers (after setup_arg_regs) + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register count = c_rarg2; // elementscount + const Register ckoff = c_rarg3; // super_check_offset + + // Registers used as temps + const Register ckval = r4; // super_klass + const Register count_save = r5; // orig elementscount + const Register copied_oop = r6; // actual oop copied + const Register oop_klass = r7; // oop._klass + const Register start_to = lr; + + //--------------------------------------------------------------- + // Assembler stub will be used for this call to arraycopy + // if the two arrays are subtypes of Object[] but the + // destination array type is not equal to or a supertype + // of the source type. Each element must be separately + // checked. + + assert_different_registers(from, to, count, ckoff, ckval, + copied_oop, oop_klass, count_save); + + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + address start = __ pc(); + + __ enter(VMFrameAPCS); // required for proper stackwalking of RuntimeStub frame + +#ifdef ASSERT + // caller guarantees that the arrays really are different + // otherwise, we would have to make conjoint checks + { Label L; + array_overlap_test(L);//, TIMES_OOP); + __ stop("checkcast_copy within a single array"); + __ bind(L); + } +#endif //ASSERT + + // Caller of this entry point must set up the argument registers. + if (entry != NULL) { + *entry = __ pc(); + BLOCK_COMMENT("Entry:"); + } + + // Empty array: Nothing to do. + __ cbz(count, L_done); + + // rscratch1 used as temp, rscratch2 can be killed by inc_counter_np + __ push(RegSet::of(count_save, copied_oop, oop_klass, ckval, rscratch1, rscratch2), sp); + __ ldr(ckval, Address(rfp, wordSize)); + +#ifdef ASSERT + BLOCK_COMMENT("assert consistent ckoff/ckval"); + // The ckoff and ckval must be mutually consistent, + // even though caller generates both. 
+ { Label L; + int sco_offset = in_bytes(Klass::super_check_offset_offset()); + __ ldr(rscratch1, Address(ckval, sco_offset)); + __ cmp(ckoff, rscratch1); + __ b(L, Assembler::EQ); + __ stop("super_check_offset inconsistent"); + __ bind(L); + } +#endif //ASSERT + + DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; + bool is_oop = true; + if (dest_uninitialized) { + decorators |= IS_DEST_UNINITIALIZED; + } + + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->arraycopy_prologue(_masm, decorators, is_oop, to, count); + + // save the original count + __ mov(count_save, count); + + // save destination array start address + __ mov(start_to, to); + + // Copy from low to high addresses + __ b(L_load_element); + + // ======== begin loop ======== + // (Loop is rotated; its entry is L_load_element.) + // Loop control: + // for (; count != 0; count--) { + // copied_oop = load_heap_oop(from++); + // ... generate_type_check ...; + // store_heap_oop(to++, copied_oop); + // } + __ align(OptoLoopAlignment); + + __ BIND(L_store_element); + __ store_heap_oop(__ post(to, 4), copied_oop, noreg, noreg, AS_RAW); // store the oop + __ sub(count, count, 1); + __ cbz(count, L_do_card_marks); + + // ======== loop entry is here ======== + __ BIND(L_load_element); + __ load_heap_oop(copied_oop, __ post(from, 4), noreg, noreg, AS_RAW); // load the oop + __ cbz(copied_oop, L_store_element); + + __ load_klass(oop_klass, copied_oop);// query the object klass + generate_type_check(oop_klass, ckoff, ckval, L_store_element); + // ======== end loop ======== + + // It was a real error; we must depend on the caller to finish the job. + // Register count = remaining oops, count_orig = total oops. + // Emit GC store barriers for the oops we have copied and report + // their number to the caller. 
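+    //
+    // A small worked example of this encoding (illustrative only): if 3 of
+    // 10 oops were copied before the type check failed, then K = 3 and the
+    // stub returns -1^K = ~3 = -4 in r0; the caller recovers the partial
+    // transfer count as ~r0.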
+ + __ subs(count, count_save, count); // K = partially copied oop count + __ inv(count, count); // report (-1^K) to caller + __ b(L_done_pop, Assembler::EQ); + + __ BIND(L_do_card_marks); + __ add(to, to, -heapOopSize); // make an inclusive end pointer + bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1); + + __ bind(L_done_pop); + inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); + __ pop(RegSet::of(count_save, copied_oop, oop_klass, ckval, rscratch1, rscratch2), sp); + + __ bind(L_done); + __ mov(r0, count); + __ leave(VMFrameAPCS); + __ b(lr); + return start; + } + + void generate_arraycopy_stubs() { + address entry; + + // jbyte + StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_copy(sizeof(jbyte), true, false, &entry, "arrayof_jbyte_disjoint_arraycopy"); + StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_copy(sizeof(jbyte), true, false, entry, NULL, "arrayof_jbyte_arraycopy"); + StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_copy(sizeof(jbyte), false, false, &entry, "jbyte_disjoint_arraycopy"); + StubRoutines::_jbyte_arraycopy = generate_conjoint_copy(sizeof(jbyte), false, false, entry, NULL, "jbyte_arraycopy"); + // jshort + StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(sizeof(jshort), true, false, &entry, "arrayof_jshort_disjoint_arraycopy"); + StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_copy(sizeof(jshort), true, false, entry, NULL, "arrayof_jshort_arraycopy"); + StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_copy(sizeof(jshort), false, false, &entry, "jshort_disjoint_arraycopy"); + StubRoutines::_jshort_arraycopy = generate_conjoint_copy(sizeof(jshort), false, false, entry, NULL, "jshort_arraycopy"); + // jint (always aligned) + StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_copy(sizeof(jint), true, false, &entry, "arrayof_jint_disjoint_arraycopy"); + StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_copy(sizeof(jint), true, false, entry, NULL, "arrayof_jint_arraycopy"); + StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy; + StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy; + // jlong (always aligned) + StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_copy(sizeof(jlong), true, false, &entry, "arrayof_jlong_disjoint_arraycopy"); + StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_copy(sizeof(jlong), true, false, entry, NULL, "arrayof_jlong_arraycopy"); + StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; + StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; + // OOP (always aligned) + StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_copy(sizeof(jint), true, true, &entry, "arrayof_oop_disjoint_arraycopy"); + StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_copy(sizeof(jint), true, true, entry, NULL, "arrayof_oop_arraycopy"); + StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_copy(sizeof(jint), true, true, &entry, "arrayof_oop_disjoint_arraycopy_uninit", true); + StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_copy(sizeof(jint), true, true, entry, NULL, "arrayof_oop_arraycopy_uninit", true); + StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; + StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; + StubRoutines::_oop_disjoint_arraycopy_uninit = 
StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
+    StubRoutines::_oop_arraycopy_uninit          = StubRoutines::_arrayof_oop_arraycopy_uninit;
+
+    StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", NULL);
+    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, true);
+  }
+
+  void generate_math_stubs() { Unimplemented(); }
+
+  // Safefetch stubs.
+  void generate_safefetch(const char* name, int size, address* entry,
+                          address* fault_pc, address* continuation_pc) {
+    // safefetch signatures:
+    //   int      SafeFetch32(int*      adr, int      errValue);
+    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
+    //
+    // arguments:
+    //   c_rarg0 = adr
+    //   c_rarg1 = errValue
+    //
+    // result:
+    //   r0 = *adr or errValue
+
+    StubCodeMark mark(this, "StubRoutines", name);
+
+    // Entry point, pc or function descriptor.
+    *entry = __ pc();
+
+    // Load *adr into c_rarg0, may fault.
+    __ mov(c_rarg2, c_rarg0);
+    *fault_pc = __ pc();
+    switch (size) {
+      case 4:
+        // int32_t
+        __ ldr(c_rarg0, Address(c_rarg2, 0));
+        break;
+      default:
+        ShouldNotReachHere();
+    }
+    __ b(lr);
+    // return errValue or *adr
+    *continuation_pc = __ pc();
+    __ mov(r0, c_rarg1);
+    __ b(lr);
+  }
+
+  /**
+   * Arguments:
+   *
+   * Inputs:
+   *   c_rarg0   - int crc
+   *   c_rarg1   - byte* buf
+   *   c_rarg2   - int length
+   *
+   * Output:
+   *   r0        - int crc result
+   *
+   * Preserves:
+   *   r13
+   *
+   */
+  address generate_updateBytesCRC32(int is_crc32c) {
+    assert(!is_crc32c ? UseCRC32Intrinsics : UseCRC32CIntrinsics, "what are we doing here?");
+
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", !is_crc32c ? "updateBytesCRC32" : "updateBytesCRC32C");
+
+    address start = __ pc();
+
+    const Register crc    = c_rarg0;  // crc
+    const Register buf    = c_rarg1;  // source java byte array address
+    const Register len    = c_rarg2;  // length
+    const Register table0 = c_rarg3;  // crc_table address
+    const Register table1 = r4;
+    const Register table2 = r5;
+    const Register table3 = lr;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp);
+
+    __ kernel_crc32(crc, buf, len,
+                    table0, table1, table2, table3, rscratch1, rscratch2, r6, is_crc32c);
+
+    __ pop(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp);
+    __ leave(); // required for proper stackwalking of RuntimeStub frame
+    __ ret(lr);
+
+    return start;
+  }
+
+  /**
+   * Arguments:
+   *
+   * Input:
+   *   c_rarg0   - x address
+   *   c_rarg1   - x length
+   *   c_rarg2   - y address
+   *   c_rarg3   - y length
+   *   sp[0]     - z address
+   *   sp[1]     - z length
+   */
+  address generate_multiplyToLen() {
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
+
+    address start = __ pc();
+    const Register x     = r0;
+    const Register xlen  = r1;
+    const Register y     = r2;
+    const Register ylen  = r3;
+
+    const Register z     = r4;
+    const Register zlen  = r5;
+
+    const Register tmp1  = r6;
+    const Register tmp2  = r7;
+    const Register tmp3  = r8;
+    const Register tmp4  = r9;
+    const Register tmp5  = r12;
+    const Register tmp6  = r14;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter(); // required for proper stackwalking of RuntimeStub frame
+    __ push(RegSet::of(z, zlen, tmp1, tmp2)+RegSet::of(tmp3, tmp4, tmp5, tmp6), sp);
+    __ ldr(z, Address(rfp, 4));
+    __ ldr(zlen, Address(rfp, 8));
+    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
+    __ pop(RegSet::of(z, zlen, tmp1, tmp2)+RegSet::of(tmp3,
tmp4, tmp5, tmp6), sp); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(lr); + + return start; + } + + /** + * Arguments: + * + * Input: + * c_rarg0 - out + * c_rarg1 - int + * c_rarg2 - offset + * c_rarg3 - len + * sp[0] - k + */ + address generate_mulAdd() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "mulAdd"); + + address start = __ pc(); + const Register out = r0; + const Register in = r1; + const Register offset = r2; + const Register len = r3; + + const Register k = r4; + + const Register tmp1 = r6; + const Register tmp2 = r7; + const Register tmp3 = r8; + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + __ push(RegSet::of(k, tmp1, tmp2, tmp3), sp); + __ ldr(k, Address(rfp, 4)); + __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3); + __ pop(RegSet::of(k, tmp1, tmp2, tmp3), sp); + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(lr); + + return start; + } + + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + + address generate_aescrypt_encryptBlock() { + assert(UseAESIntrinsics, "what are we doing here?"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); + + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = c_rarg3; + const Register table1 = r4; + const Register t0 = r5; + const Register t1 = r6; + const Register t2 = r7; + const Register t3 = r8; + const Register t4 = r9; + const Register t5 = r10; + const Register t6 = r11; + const Register t7 = r12; + + BLOCK_COMMENT("Entry:"); + __ enter(); + + __ push(RegSet::of(r4, r5, r6, r7, r8), sp); + __ push(RegSet::of(r9, r10, r11, r12), sp); + __ kernel_aescrypt_encryptBlock(from, to, key, keylen, table1, + t0, t1, t2, t3, t4, t5, t6, t7); + __ pop(RegSet::of(r9, r10, r11, r12), sp); + __ pop(RegSet::of(r4, r5, r6, r7, r8), sp); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - destination byte array address + // c_rarg2 - K (key) in little endian int array + // + + address generate_aescrypt_decryptBlock() { + assert(UseAESIntrinsics, "what are we doing here?"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); + + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register to = c_rarg1; // destination array address + const Register key = c_rarg2; // key array address + const Register keylen = c_rarg3; + const Register table1 = r4; + const Register t0 = r5; + const Register t1 = r6; + const Register t2 = r7; + const Register t3 = r8; + const Register t4 = r9; + const Register t5 = r10; + const Register t6 = r11; + const Register t7 = r12; + + BLOCK_COMMENT("Entry:"); + __ enter(); + + __ push(RegSet::of(r4, r5, r6, r7, r8), sp); + __ push(RegSet::of(r9, r10, r11, r12), sp); + __ kernel_aescrypt_decryptBlock(from, to, key, keylen, table1, + t0, t1, t2, t3, t4, t5, t6, t7); + __ pop(RegSet::of(r9, r10, r11, r12), sp); + __ pop(RegSet::of(r4, r5, r6, r7, r8), sp); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source 
byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   r0        - input length
+  //
+
+  address generate_cipherBlockChaining_encryptAESCrypt(bool len_on_stack) {
+    assert(UseAESIntrinsics && UseNeon, "what are we doing here?");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
+
+    address start = __ pc();
+
+    const Register from   = c_rarg0;  // source array address
+    const Register to     = c_rarg1;  // destination array address
+    const Register key    = c_rarg2;  // key array address
+    const Register rvec   = c_rarg3;  // r byte array initialized from initvector array address
+                                      // and left with the results of the last encryption block
+    const Register len    = r4;       // src len (must be multiple of blocksize 16)
+    const Register keylen = r5;
+    const Register table  = r6;
+    const Register t0     = r7;
+    const Register t1     = r8;
+    const Register t2     = r9;
+    const Register t3     = r10;
+    const Register t4     = r11;
+    const Register t5     = r12;
+    const Register t6     = lr;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+
+    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
+    __ push(RegSet::of(r9, r10, r11, r12), sp);
+    __ vstmdb_f64(sp, 0xff00); // d8-d15 are callee save registers
+
+    if (len_on_stack)
+      __ ldr(len, Address(rfp, wordSize));
+    __ kernel_aescrypt_encrypt(from, to, key, rvec, len, keylen, table,
+                               t0, t1, t2, t3, t4, t5, t6);
+
+    __ vldmia_f64(sp, 0xff00);
+    __ pop(RegSet::of(r9, r10, r11, r12), sp);
+    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);
+
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - destination byte array address
+  //   c_rarg2   - K (key) in little endian int array
+  //   c_rarg3   - r vector byte array address
+  //   c_rarg4   - input length
+  //
+  // Output:
+  //   r0        - input length
+  //
+
+  address generate_cipherBlockChaining_decryptAESCrypt(bool len_on_stack) {
+    assert(UseAESIntrinsics && UseNeon, "what are we doing here?");
+    __ align(CodeEntryAlignment);
+    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
+
+    address start = __ pc();
+
+    const Register from   = c_rarg0;  // source array address
+    const Register to     = c_rarg1;  // destination array address
+    const Register key    = c_rarg2;  // key array address
+    const Register rvec   = c_rarg3;  // r byte array initialized from initvector array address
+                                      // and left with the results of the last encryption block
+    const Register len    = r4;       // src len (must be multiple of blocksize 16)
+    const Register keylen = r5;
+    const Register table  = r6;
+    const Register t0     = r7;
+    const Register t1     = r8;
+    const Register t2     = r9;
+    const Register t3     = r10;
+    const Register t4     = r11;
+    const Register t5     = r12;
+    const Register t6     = lr;
+
+    BLOCK_COMMENT("Entry:");
+    __ enter();
+
+    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
+    __ push(RegSet::of(r9, r10, r11, r12), sp);
+    __ vstmdb_f64(sp, 0xff00); // d8-d15 are callee save registers
+
+    if (len_on_stack)
+      __ ldr(len, Address(rfp, wordSize));
+    __ kernel_aescrypt_decrypt(from, to, key, rvec, len, keylen, table,
+                               t0, t1, t2, t3, t4, t5, t6);
+
+    __ vldmia_f64(sp, 0xff00);
+    __ pop(RegSet::of(r9, r10, r11, r12), sp);
+    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);
+
+    __ leave();
+    __ ret(lr);
+
+    return start;
+  }
+
+  // Arguments:
+  //
+  // Inputs:
+  //   c_rarg0   - source byte array address
+  //   c_rarg1   - state
array + + address generate_sha_implCompress() { + assert(UseSHA1Intrinsics, "what are we doing here?"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "sha_implCompress"); + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register state = c_rarg1; // state array address + const Register t0 = c_rarg2; + const Register t1 = c_rarg3; + const Register t2 = r4; + const Register t3 = r5; + const Register t4 = r6; + const Register t5 = r7; + const Register t6 = r8; + const Register t7 = r9; + const Register t8 = r10; + const Register t9 = r11; + const Register t10 = r12; + DoubleFloatRegSet _fToSave = DoubleFloatRegSet::range(d0, d15); + + BLOCK_COMMENT("Entry:"); + __ enter(); + + __ push(RegSet::of(r4, r5, r6, r7, r8), sp); + __ push(RegSet::of(r9, r10, r11, r12), sp); + __ vstmdb_f64(sp, _fToSave.bits()); + + __ kernel_sha_implCompress(from, state, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10); + + __ vldmia_f64(sp, _fToSave.bits(), true); + __ pop(RegSet::of(r9, r10, r11, r12), sp); + __ pop(RegSet::of(r4, r5, r6, r7, r8), sp); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - state array + + address generate_sha256_implCompress() { + assert(UseSHA256Intrinsics, "what are we doing here?"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "sha256_implCompress"); + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register state = c_rarg1; // state array address + const Register t0 = c_rarg2; + const Register t1 = c_rarg3; + const Register t2 = r4; + const Register t3 = r5; + const Register t4 = r6; + const Register t5 = r7; + const Register t6 = r8; + const Register t7 = r9; + const Register t8 = r10; + const Register t9 = r11; + const Register t10 = r12; + const Register t11 = lr; + DoubleFloatRegSet _fToSave1 = DoubleFloatRegSet::range(d0, d15); + DoubleFloatRegSet _fToSave2 = DoubleFloatRegSet::range(d16,d31); + + BLOCK_COMMENT("Entry:"); + __ enter(); + + __ push(RegSet::of(r4, r5, r6, r7, r8), sp); + __ push(RegSet::of(r9, r10, r11, r12, lr), sp); + __ vstmdb_f64(sp, _fToSave1.bits()); + __ vstmdb_f64(sp, _fToSave2.bits()); + + __ kernel_sha256_implCompress(from, state, t0, t1, + t2, t3, t4, t5, t6, t7, t8, t9, t10, t11); + + __ vldmia_f64(sp, _fToSave2.bits(), true); + __ vldmia_f64(sp, _fToSave1.bits(), true); + __ pop(RegSet::of(r9, r10, r11, r12, lr), sp); + __ pop(RegSet::of(r4, r5, r6, r7, r8), sp); + + __ leave(); + __ ret(lr); + + return start; + } + + // Arguments: + // + // Inputs: + // c_rarg0 - source byte array address + // c_rarg1 - state array + + address generate_sha512_implCompress() { + assert(UseSHA512Intrinsics, "what are we doing here?"); + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "sha512_implCompress"); + address start = __ pc(); + + const Register from = c_rarg0; // source array address + const Register state = c_rarg1; // state array address + const Register t0 = c_rarg2; + const Register t1 = c_rarg3; + DoubleFloatRegSet _fToSave1 = DoubleFloatRegSet::range(d0, d15); + DoubleFloatRegSet _fToSave2 = DoubleFloatRegSet::range(d16,d31); + + + BLOCK_COMMENT("Entry:"); + __ enter(); + + __ vstmdb_f64(sp, _fToSave1.bits()); + __ vstmdb_f64(sp, _fToSave2.bits()); + + __ kernel_sha512_implCompress(from, state, t0, t1); + + __ vldmia_f64(sp, _fToSave2.bits(), true); + __ vldmia_f64(sp, _fToSave1.bits(), true); + + __ 
leave(); + __ ret(lr); + + return start; + } + + // Continuation point for throwing of implicit exceptions that are + // not handled in the current activation. Fabricates an exception + // oop and initiates normal exception dispatching in this + // frame. Since we need to preserve callee-saved values (currently + // only for C2, but done for C1 as well) we need a callee-saved oop + // map and therefore have to make these stubs into RuntimeStubs + // rather than BufferBlobs. If the compiler needs all registers to + // be preserved between the fault point and the exception handler + // then it must assume responsibility for that in + // AbstractCompiler::continuation_for_implicit_null_exception or + // continuation_for_implicit_division_by_zero_exception. All other + // implicit exceptions (e.g., NullPointerException or + // AbstractMethodError on entry) are either at call sites or + // otherwise assume that stack unwinding will be initiated, so + // caller saved registers were assumed volatile in the compiler. + +#undef __ +#define __ masm-> + + address generate_throw_exception(const char* name, + address runtime_entry, + Register arg1 = noreg, + Register arg2 = noreg) { + // Information about frame layout at time of blocking runtime call. + // Note that we only have to preserve callee-saved registers since + // the compilers are responsible for supplying a continuation point + // if they expect all registers to be preserved. + // n.b. aarch32 asserts that frame::arg_reg_save_area_bytes == 0 + const int framesize = frame::get_frame_size(); + const int insts_size = 512; + const int locs_size = 64; + + CodeBuffer code(name, insts_size, locs_size); + OopMapSet* oop_maps = new OopMapSet(); + MacroAssembler* masm = new MacroAssembler(&code); + + address start = __ pc(); + + // This is an inlined and slightly modified version of call_VM + // which has the ability to fetch the return PC out of + // thread-local storage and also sets up last_Java_sp slightly + // differently than the real call_VM + + __ enter(); // Save at least FP and LR before call + + assert(is_even(framesize), "sp not 8-byte aligned"); + + int frame_complete = __ pc() - start; + + // Set up last_Java_sp and last_Java_fp + address the_pc = __ pc(); + __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); + + // Call runtime + if (arg1 != noreg) { + assert(arg2 != c_rarg1, "clobbered"); + __ mov(c_rarg1, arg1); + } + if (arg2 != noreg) { + __ mov(c_rarg2, arg2); + } + __ mov(c_rarg0, rthread); + BLOCK_COMMENT("call runtime_entry"); + __ align_stack(); + __ mov(rscratch1, runtime_entry); + __ bl(rscratch1); + + // Generate oop map + OopMap* map = new OopMap(framesize, 0); + + oop_maps->add_gc_map(the_pc - start, map); + + __ reset_last_Java_frame(true); + __ maybe_isb(); + + __ leave(); + + // check for pending exceptions +#ifdef ASSERT + Label L; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbnz(rscratch1, L); + __ should_not_reach_here(); + __ bind(L); +#endif // ASSERT + __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); + + + // codeBlob framesize is in words (not VMRegImpl::slot_size) + RuntimeStub* stub = + RuntimeStub::new_runtime_stub(name, + &code, + frame_complete, + framesize, + oop_maps, false); + return stub->entry_point(); + } + + class MontgomeryMultiplyGenerator : public MacroAssembler { + + Register Pa_base, Pb_base, Pn_base, Pm_base, Rlen, Ri, Rj, Pa, Pb, Pn, Pm; + FloatRegister inv, Ra, Rb, Rm, Rn, RabAB, RaBAb, s0, s1, s2, tmp; + + RegSet _toSave; + 
DoubleFloatRegSet _fToSave;
+    bool _squaring;
+
+  public:
+    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
+      : MacroAssembler(as->code()), _squaring(squaring) {
+
+      // Register allocation
+
+      Register reg = c_rarg0;
+
+      Pa_base = reg++;       // Argument registers
+      if (squaring)
+        Pb_base = Pa_base;
+      else
+        Pb_base = reg++;
+      Pn_base = reg++;
+      Rlen = reg++;
+      Pm_base = r4;
+
+      Ri = r5;               // Inner and outer loop indexes.
+      Rj = r6;
+
+      Pa = r7;               // Pointers to the current/next digit of a, b, n, and m.
+      Pb = r8;
+      Pm = r9;
+      Pn = r12;
+
+      _toSave = RegSet::range(r4, r8) + RegSet::of(r9, r12);
+
+      // Now NEON registers
+
+      // Working registers:
+      Ra = d0;               // The current digit of a, b, n, and m.
+      Rb = d1;               // The values are stored as read, that is high and
+      Rm = d2;               // low 32-bit parts are exchanged
+      Rn = d3;
+
+      // Three registers which form a triple-precision accumulator.
+      // For sake of performance these are 128-bit and are overlapping
+      // (hence the name is s, not t). The schema is the following:
+      //    w4|w3|w2|w1|w0|  (32-bit words)
+      // s0 lo: |**|**|
+      // s0 hi: |**|**|
+      // s1 lo: |**|**|
+      // s1 hi: |**|**|
+      // s2 lo: |**|**|
+      // s2 hi: |**|**|
+      // the idea is that each of the 64-bit s registers accumulates only
+      // 32-bit numbers and hence never needs a carry operation
+
+      s0 = q2;
+      s1 = q3;
+      s2 = q4;
+
+      RabAB = q5;            // Product registers: low, high and middle parts
+      RaBAb = q6;            // of a*b and m*n. hi(A)*hi(B) is the same quad as lo(a)*lo(b)
+
+      inv = d14;
+      tmp = d15;
+
+      _fToSave = DoubleFloatRegSet::range(d8, tmp);
+    }
+
+  private:
+    void save_regs() {
+      vstmdb_f64(sp, _fToSave.bits());
+      push(_toSave, sp);
+    }
+
+    void restore_regs() {
+      pop(_toSave, sp);
+      vldmia_f64(sp, _fToSave.bits(), true);
+    }
+
+    template <typename T>
+    void unroll_2(Register count, T block) {
+      Label loop, end, odd;
+      tbnz(count, 0, odd);
+      cbz(count, end);
+      align(16);
+      bind(loop);
+      (this->*block)();
+      bind(odd);
+      (this->*block)();
+      subs(count, count, 2);
+      b(loop, Assembler::GT);
+      bind(end);
+    }
+
+    void pre1(Register i) {
+      block_comment("pre1");
+      // Pa = Pa_base;
+      // Pb = Pb_base + i;
+      // Pm = Pm_base;
+      // Pn = Pn_base + i;
+      // Ra = *Pa;
+      // Rb = *Pb;
+      // Rm = *Pm;
+      // Rn = *Pn;
+      lea(Pa, Address(Pa_base));
+      lea(Pb, Address(Pb_base, i, lsl(LogBytesPerLong), Address::SUB));
+      lea(Pm, Address(Pm_base));
+      lea(Pn, Address(Pn_base, i, lsl(LogBytesPerLong), Address::SUB));
+
+      vld1_64(Ra, Address(Pa), Assembler::ALIGN_STD);
+      vld1_64(Rb, Address(Pb), Assembler::ALIGN_STD);
+      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
+      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
+    }
+
+    // The core multiply-accumulate step of a Montgomery
+    // multiplication. The idea is to schedule operations as a
+    // pipeline so that instructions with long latencies (loads and
+    // multiplies) have time to complete before their results are
+    // used. This most benefits in-order implementations of the
+    // architecture but out-of-order ones also benefit.
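+    //
+    // For reference, the scalar multiply-accumulate primitive that step()
+    // pipelines corresponds to the MACC() used in the "In C, approximately"
+    // comments further below. A rough C sketch (an illustrative assumption,
+    // using 64-bit digits and a compiler-provided 128-bit type; not part of
+    // the generated code):
+    //
+    //   // t2:t1:t0 += a * b  (triple-precision accumulate)
+    //   void MACC(unsigned long long a, unsigned long long b,
+    //             unsigned long long &t0, unsigned long long &t1,
+    //             unsigned long long &t2) {
+    //     unsigned __int128 p   = (unsigned __int128)a * b;
+    //     unsigned __int128 lo  = (unsigned __int128)t0 + (unsigned long long)p;
+    //     t0 = (unsigned long long)lo;
+    //     unsigned __int128 mid = (unsigned __int128)t1
+    //                           + (unsigned long long)(p >> 64)
+    //                           + (unsigned long long)(lo >> 64);
+    //     t1 = (unsigned long long)mid;
+    //     t2 += (unsigned long long)(mid >> 64);
+    //   }
+    //
+    // The NEON code below keeps this accumulator in the s0/s1/s2 register
+    // triple described above rather than in three scalar words.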
+ void step() { + block_comment("step"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + sub(Pm, Pm, BytesPerLong); + add(Pn, Pn, BytesPerLong); + vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb); + vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD); + vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD); + vmul_acc2(tmp, RabAB, RaBAb); + + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + sub(Pa, Pa, BytesPerLong); + add(Pb, Pb, BytesPerLong); + vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb); + vld1_64(Ra, Address(Pa), Assembler::ALIGN_STD); + vld1_64(Rb, Address(Pb), Assembler::ALIGN_STD); + vmul_acc2(tmp, RabAB, RaBAb); + } + + void post1() { + FloatRegister t0 = RabAB; + + block_comment("post1"); + + // MACC(Ra, Rb, t0, t1, t2); + vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb); + vmul_acc2(tmp, RabAB, RaBAb); + + // *Pm = Rm = t0 * inv; + vmul_fin(t0, tmp); + vmul_simple(Rm, t0, inv, RaBAb); // RaBAb is tmp + vrev64_64_32(Rm, Rm); // write in reversed, big-endian format + vst1_64(Rm, Address(Pm), ALIGN_STD); + + // MACC(Rm, Rn, t0, t1, t2); + vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb); + vmul_acc2(tmp, RabAB, RaBAb); + +#ifndef PRODUCT + // assert(t0 == 0, "broken Montgomery multiply"); + { + vmul_fin(t0, tmp); + Label ok; + push(RegSet::of(Ri, Rj), sp); + vmov_f64(Ri, Rj, t0); + orr(Ri, Ri, Rj); + cbz(Ri, ok); { + stop("broken Montgomery multiply"); + } bind(ok); + pop(RegSet::of(Ri, Rj), sp); + } +#endif + + // t0 = t1; t1 = t2; t2 = 0; + shift_t(RabAB); + } + + void pre2(Register i, Register len) { + block_comment("pre2"); + // Pa = Pa_base + i-len; + // Pb = Pb_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Rj == i-len + sub(Rj, i, len); + + lea(Pa, Address(Pa_base, Rj, lsl(LogBytesPerLong), Address::SUB)); + lea(Pb, Address(Pb_base, len, lsl(LogBytesPerLong), Address::SUB)); + lea(Pm, Address(Pm_base, Rj, lsl(LogBytesPerLong), Address::SUB)); + lea(Pn, Address(Pn_base, len, lsl(LogBytesPerLong), Address::SUB)); + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + sub(Pa, Pa, BytesPerLong); + add(Pb, Pb, BytesPerLong); + sub(Pm, Pm, BytesPerLong); + add(Pn, Pn, BytesPerLong); + + vld1_64(Ra, Address(Pa), ALIGN_STD); + vld1_64(Rb, Address(Pb), ALIGN_STD); + vld1_64(Rm, Address(Pm), ALIGN_STD); + vld1_64(Rn, Address(Pn), ALIGN_STD); + } + + void post2(Register i, Register len) { + FloatRegister t0 = RabAB; + + block_comment("post2"); + + vmul_fin(t0, tmp); + + // As soon as we know the least significant digit of our result, + // store it. + // Pm_base[i-len] = t0; + sub(Rj, i, len); + lea(Rj, Address(Pm_base, Rj, lsl(LogBytesPerLong), Address::SUB)); + vrev64_64_32(t0, t0); + vst1_64(t0, Address(Rj), ALIGN_STD); + + // t0 = t1; t1 = t2; t2 = 0; + shift_t(RabAB); + } + + // A carry in t0 after Montgomery multiplication means that we + // should subtract multiples of n from our result in m. We'll + // keep doing that until there is no carry. 
ARM registers are used + // for this operation, this is faster than using NEON + void normalize(Register len, Register t0lo, Register t0hi, + Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) { + block_comment("normalize"); + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + Label loop, post, again; + Register cnt = tmp1, i = tmp2, m = tmp3, n = tmp4, flags = tmp5; + // let them point to last 32-bit element now + add(Pn_base, Pn_base, BytesPerInt); + add(Pm_base, Pm_base, BytesPerInt); + orrs(n, t0lo, t0hi); + b(post, EQ); { + bind(again); { + mov(i, 0); + mov(cnt, len); // each loop processes 64 bits + ldr(m, Address(Pm_base)); + ldr(n, Address(Pn_base)); + cmp(n, n); // set carry flag, i.e. no borrow + mrs(flags); + align(16); + bind(loop); { + msr(flags, true, false); + sbcs(m, m, n); + str(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB)); + add(i, i, 1); + ldr(n, Address(Pn_base, i, lsl(LogBytesPerWord), Address::SUB)); + ldr(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB)); + sbcs(m, m, n); + mrs(flags); + str(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB)); + add(i, i, 1); + ldr(n, Address(Pn_base, i, lsl(LogBytesPerWord), Address::SUB)); + ldr(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB)); + sub(cnt, cnt, 1); + } cbnz(cnt, loop); + msr(flags, true, false); + sbcs(t0lo, t0lo, 0); + sbc(t0hi, t0hi, 0); + orrs(n, t0lo, t0hi); + } b(again, NE); + } bind(post); + } + + void step_squaring() { + // An extra ACC for A*B + step(); + vmul_acc2(tmp, RabAB, RaBAb, false); + } + + void last_squaring(Register i) { + Label dont; + // if ((i & 1) == 0) { + tbnz(i, 0, dont); { + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + sub(Pa, Pa, BytesPerLong); + add(Pb, Pb, BytesPerLong); + vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb); + vmul_acc2(tmp, RabAB, RaBAb); + } bind(dont); + } + + void extra_step_squaring() { + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + sub(Pm, Pm, BytesPerLong); + add(Pn, Pn, BytesPerLong); + vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb); + vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD); + vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD); + vmul_acc2(tmp, RabAB, RaBAb); + } + + void post1_squaring() { + FloatRegister t0 = RabAB; + + // *Pm = Rm = t0 * inv; + vmul_fin(t0, tmp); + vmul_simple(Rm, t0, inv, RaBAb); // RaBAb is tmp + vrev64_64_32(Rm, Rm); + vst1_64(Rm, Address(Pm), ALIGN_STD); + + // MACC(Rm, Rn, t0, t1, t2); + vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb); + vmul_acc2(tmp, RabAB, RaBAb); + +#ifndef PRODUCT + // assert(t0 == 0, "broken Montgomery multiply"); + { + vmul_fin(t0, tmp); + Label ok; + push(RegSet::of(Ri, Rj), sp); + vmov_f64(Ri, Rj, t0); + orr(Ri, Ri, Rj); + cbz(Ri, ok); { + stop("broken Montgomery square"); + } bind(ok); + pop(RegSet::of(Ri, Rj), sp); + } +#endif + + // t0 = t1; t1 = t2; t2 = 0; + shift_t(RabAB); + } + + /** + * Initializes the accumulators + */ + void vmul_init() { + vmov_128_32(s0, 0); + vmov_128_32(s1, 0); + vmov_128_32(s2, 0); + } + + /** + * Multiplies unsigned 64-bit a by unsigned 64-bit b accumulating the + * result into temp array (s0-s2). temp array is not converged into + * resulting number. See vmul_fin. + * Performance critical part. 
+ * @param a first operand + * @param b second operand + */ + void vmul_acc1(FloatRegister a, FloatRegister b, FloatRegister tmp, FloatRegister RabAB, FloatRegister RaBAb) { + vrev64_64_32(tmp, b); + vmull_32u(RabAB, a, b); + vmull_32u(RaBAb, a, tmp); + } + + void vmul_acc2(FloatRegister tmp, FloatRegister RabAB, FloatRegister RaBAb, bool trn_aBAb = true) { + // words 2-0 of accumulator + vaddw_32u(s0, s0, RabAB->successor(FloatRegisterImpl::DOUBLE)); + if (trn_aBAb) { + // words 3-1 of accumulator. phase 1 + vtrn_64_32(RaBAb, RaBAb->successor(FloatRegisterImpl::DOUBLE)); + } + // words 4-2 of accumulator + vaddw_32u(s2, s2, RabAB); + // words 3-1 of accumulator. phase 2 + vpadal_128_u32(s1, RaBAb); + } + + /** + * Simple unsigned 64-bit multiply a by b. + * Least significant 64 bits of result are written into register res, + * the rest are discarded. + * @param res 64-bit result + * @param a 64-bit operand + * @param b 64-bit operand + * @param tmp 128-bit temporary register + */ + void vmul_simple(FloatRegister res, FloatRegister a, FloatRegister b, FloatRegister tmp) { + FloatRegister tmp2 = tmp->successor(FloatRegisterImpl::DOUBLE); + vmull_32u(tmp, a, b); + vrev64_64_32(tmp2, b); + vmul_64_32(tmp2, a, tmp2); + vpaddl_64_u32(tmp2, tmp2); + vshl_64_64(tmp2, tmp2, 32); + vadd_64_64(res, tmp, tmp2); + } + + /** + * Converges the temp array and returns least significant 64 bits of the result. + * @param t0 the register to write the least significant 64 bits of result + * @param tmp 64-bit temporary register + */ + void vmul_fin(FloatRegister t0, FloatRegister tmp1) { + FloatRegister abLow = s0; + FloatRegister abHigh = s0->successor(FloatRegisterImpl::DOUBLE); + FloatRegister aBAbLow = s1; + + // words 0 and 1 + vshr_64_u64(tmp1, abLow, 32); + vadd_64_64(tmp1, tmp1, abHigh); + vadd_64_64(tmp1, tmp1, aBAbLow); + vmov_64(t0, abLow); + vsli_64_64(t0, tmp1, 32); + } + + /** + * Performs t0 = t1; t1 = t2; t2 = 0; represented as s0-s2. + * @param tmp 128-bit register + */ + void shift_t(FloatRegister tmp) { + FloatRegister s0hi = s0->successor(FloatRegisterImpl::DOUBLE); + FloatRegister s1hi = s1->successor(FloatRegisterImpl::DOUBLE); + FloatRegister s2hi = s2->successor(FloatRegisterImpl::DOUBLE); + FloatRegister tmphi = tmp->successor(FloatRegisterImpl::DOUBLE); + vshr_64_u64(s0, s0, 32); + vaddl_32u(tmp, s1, s0hi); + vadd_64_64(s0, s0, tmp); + vshr_64_u64(s0, s0, 32); + vadd_64_64(tmphi, s0, tmphi); + vaddl_32u(s0, s1hi, s2); + vadd_64_64(s0, s0, tmphi); + vmov_64(s1, s2hi); + vmov_64_32(s1hi, 0); + vmov_128_32(s2, 0); + } + + public: + /** + * Fast Montgomery multiplication. The derivation of the + * algorithm is in A Cryptographic Library for the Motorola + * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
+ * + * Arguments: + * + * Inputs for multiplication: + * c_rarg0 - int64 array elements a + * c_rarg1 - int64 array elements b + * c_rarg2 - int64 array elements n (the modulus) + * c_rarg3 - int64 length + * [sp] - int64 inv + * [sp+8] - int64 array elements m (the result) + * + */ + address generate_multiply() { + Label nothing; + align(CodeEntryAlignment); + address entry = pc(); + + cbz(Rlen, nothing); + + enter(); + + // Push all call-saved registers + save_regs(); + + // load inv and m array pointer + add(Ri, rfp, 4); + vld1_64(inv, Address(Ri), ALIGN_STD); + ldr(Pm_base, Address(Ri, BytesPerLong)); + + lsr(Rlen, Rlen, 1); // length in longwords = len/2 + + // let Px_base point on last 64-bit element of an array + add(Pa_base, Pa_base, Rlen, lsl(LogBytesPerLong)); + sub(Pa_base, Pa_base, BytesPerLong); + if (!_squaring) { + add(Pb_base, Pb_base, Rlen, lsl(LogBytesPerLong)); + sub(Pb_base, Pb_base, BytesPerLong); + } + add(Pn_base, Pn_base, Rlen, lsl(LogBytesPerLong)); + sub(Pn_base, Pn_base, BytesPerLong); + add(Pm_base, Pm_base, Rlen, lsl(LogBytesPerLong)); + sub(Pm_base, Pm_base, BytesPerLong); + +#ifndef PRODUCT + // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); + { + // Pn, Pm and s0 are used as a temporary + vld1_64(Rn, Address(Pn_base), Assembler::ALIGN_STD); + vrev64_64_32(Rn, Rn); + vmul_simple(tmp, Rn, inv, s0); + vmov_f64(Pm, Pn, tmp); + andr(Pm, Pm, Pn); + cmn(Pm, 1); + Label ok; + b(ok, EQ); { + stop("broken inverse in Montgomery multiply"); + } bind(ok); + } +#endif + + vmul_init(); + + block_comment("for (int i = 0; i < len; i++) {"); + mov(Ri, 0); { + Label loop, end; + cmp(Ri, Rlen); + b(end, Assembler::GE); + + bind(loop); + pre1(Ri); + + block_comment(" for (j = i; j; j--) {"); { + mov(Rj, Ri); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step); + } block_comment(" } // j"); + + post1(); + add(Ri, Ri, 1); + cmp(Ri, Rlen); + b(loop, Assembler::LT); + bind(end); + block_comment("} // i"); + } + + block_comment("for (int i = len; i < 2*len; i++) {"); + mov(Ri, Rlen); { + Label loop, end; + cmp(Ri, Rlen, lsl(1)); + b(end, Assembler::GE); + + bind(loop); + pre2(Ri, Rlen); + + block_comment(" for (j = len*2-i-1; j; j--) {"); { + lsl(Rj, Rlen, 1); + sub(Rj, Rj, Ri); + sub(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step); + } block_comment(" } // j"); + + post2(Ri, Rlen); + add(Ri, Ri, 1); + cmp(Ri, Rlen, lsl(1)); + b(loop, Assembler::LT); + bind(end); + } + block_comment("} // i"); + + FloatRegister t0 = RabAB; // use as temporary + vmul_fin(t0, tmp); + vmov_f64(Pa, Pb, t0); + normalize(Rlen, Pa, Pb, Pm, Pn, Ri, Rj, Pa_base); + + restore_regs(); + leave(); + bind(nothing); + ret(lr); + + return entry; + } + // In C, approximately: + + // void + // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], + // unsigned long Pn_base[], unsigned long Pm_base[], + // unsigned long inv, int len) { + // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator + // unsigned long *Pa, *Pb, *Pn, *Pm; + // unsigned long Ra, Rb, Rn, Rm; + + // int i; + + // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); + + // for (i = 0; i < len; i++) { + // int j; + + // Pa = Pa_base; + // Pb = Pb_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + + // int iters = i; + // for (j = 0; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == 
Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // *Pm = Rm = t0 * inv; + // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + + // assert(t0 == 0, "broken Montgomery multiply"); + + // t0 = t1; t1 = t2; t2 = 0; + // } + + // for (i = len; i < 2*len; i++) { + // int j; + + // Pa = Pa_base + i-len; + // Pb = Pb_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + + // int iters = len*2-i-1; + // for (j = i-len+1; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); + // MACC(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // Pm_base[i-len] = t0; + // t0 = t1; t1 = t2; t2 = 0; + // } + + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + // } + + /** + * Fast Montgomery squaring. This uses asymptotically 25% fewer + * multiplies than Montgomery multiplication so it should be up to + * 25% faster. However, its loop control is more complex and it + * may actually run slower on some machines. + * + * Arguments: + * + * Inputs: + * c_rarg0 - int64 array elements a + * c_rarg1 - int64 array elements n (the modulus) + * c_rarg2 - int length + * [sp] - int inv + * [sp+8] - int array elements m (the result) + * + */ + address generate_square() { + align(CodeEntryAlignment); + address entry = pc(); + + enter(); + + save_regs(); + + // load inv and m array pointer + add(Ri, rfp, 4); + vld1_64(inv, Address(Ri), ALIGN_STD); + ldr(Pm_base, Address(Ri, BytesPerLong)); + + lsr(Rlen, Rlen, 1); // length in longwords = len/2 + + // let Px_base point on last 64-bit element of an array + add(Pa_base, Pa_base, Rlen, lsl(LogBytesPerLong)); + sub(Pa_base, Pa_base, BytesPerLong); + add(Pn_base, Pn_base, Rlen, lsl(LogBytesPerLong)); + sub(Pn_base, Pn_base, BytesPerLong); + add(Pm_base, Pm_base, Rlen, lsl(LogBytesPerLong)); + sub(Pm_base, Pm_base, BytesPerLong); + +#ifndef PRODUCT + // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); + { + // Pn, Pm and s0 are used as a temporary + vld1_64(Rn, Address(Pn_base), Assembler::ALIGN_STD); + vrev64_64_32(Rn, Rn); + vmul_simple(tmp, Rn, inv, s0); + vmov_f64(Pm, Pn, tmp); + andr(Pm, Pm, Pn); + cmn(Pm, 1); + Label ok; + b(ok, EQ); { + stop("broken inverse in Montgomery square"); + } bind(ok); + } +#endif + + vmul_init(); + + block_comment("for (int i = 0; i < len; i++) {"); + mov(Ri, 0); { + Label loop, end; + bind(loop); + cmp(Ri, Rlen); + b(end, GE); + + pre1(Ri); + + block_comment("for (j = (i+1)/2; j; j--) {"); { + add(Rj, Ri, 1); + lsr(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); + } block_comment(" } // j"); + + last_squaring(Ri); + + block_comment(" for (j = i/2; j; j--) {"); { + lsr(Rj, Ri, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); + } block_comment(" } // j"); + + post1_squaring(); + add(Ri, Ri, 1); + cmp(Ri, Rlen); + b(loop, LT); + + bind(end); + block_comment("} // i"); + } + + block_comment("for (int i = len; i < 2*len; i++) {"); + mov(Ri, Rlen); { + Label loop, end; + bind(loop); + cmp(Ri, Rlen, lsl(1)); + b(end, GE); + + pre2(Ri, Rlen); + + block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { + lsl(Rj, Rlen, 1); + sub(Rj, 
Rj, Ri); + sub(Rj, Rj, 1); + lsr(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); + } block_comment(" } // j"); + + last_squaring(Ri); + + block_comment(" for (j = (2*len-i)/2; j; j--) {"); { + lsl(Rj, Rlen, 1); + sub(Rj, Rj, Ri); + lsr(Rj, Rj, 1); + unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); + } block_comment(" } // j"); + + post2(Ri, Rlen); + add(Ri, Ri, 1); + cmp(Ri, Rlen, lsl(1)); + + b(loop, LT); + bind(end); + block_comment("} // i"); + } + + FloatRegister t0 = RabAB; // use as temporary + vmul_fin(t0, tmp); + vmov_f64(Pa, Pb, t0); + normalize(Rlen, Pa, Pb, Pm, Pn, Ri, Rj, Pa_base); + + restore_regs(); + leave(); + ret(lr); + + return entry; + } + // In C, approximately: + + // void + // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], + // unsigned long Pm_base[], unsigned long inv, int len) { + // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator + // unsigned long *Pa, *Pb, *Pn, *Pm; + // unsigned long Ra, Rb, Rn, Rm; + + // int i; + + // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); + + // for (i = 0; i < len; i++) { + // int j; + + // Pa = Pa_base; + // Pb = Pa_base + i; + // Pm = Pm_base; + // Pn = Pn_base + i; + + // Ra = *Pa; + // Rb = *Pb; + // Rm = *Pm; + // Rn = *Pn; + + // int iters = (i+1)/2; + // for (j = 0; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); + // MACC2(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // if ((i & 1) == 0) { + // assert(Ra == Pa_base[j], "must be"); + // MACC(Ra, Ra, t0, t1, t2); + // } + // iters = i/2; + // assert(iters == i-j, "must be"); + // for (; iters--; j++) { + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + + // *Pm = Rm = t0 * inv; + // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + + // assert(t0 == 0, "broken Montgomery multiply"); + + // t0 = t1; t1 = t2; t2 = 0; + // } + + // for (i = len; i < 2*len; i++) { + // int start = i-len+1; + // int end = start + (len - start)/2; + // int j; + + // Pa = Pa_base + i-len; + // Pb = Pa_base + len; + // Pm = Pm_base + i-len; + // Pn = Pn_base + len; + + // Ra = *++Pa; + // Rb = *--Pb; + // Rm = *++Pm; + // Rn = *--Pn; + + // int iters = (2*len-i-1)/2; + // assert(iters == end-start, "must be"); + // for (j = start; iters--; j++) { + // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); + // MACC2(Ra, Rb, t0, t1, t2); + // Ra = *++Pa; + // Rb = *--Pb; + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // if ((i & 1) == 0) { + // assert(Ra == Pa_base[j], "must be"); + // MACC(Ra, Ra, t0, t1, t2); + // } + // iters = (2*len-i)/2; + // assert(iters == len-j, "must be"); + // for (; iters--; j++) { + // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); + // MACC(Rm, Rn, t0, t1, t2); + // Rm = *++Pm; + // Rn = *--Pn; + // } + // Pm_base[i-len] = t0; + // t0 = t1; t1 = t2; t2 = 0; + // } + + // while (t0) + // t0 = sub(Pm_base, Pn_base, t0, len); + // } + }; + + // Initialization + void generate_initial() { + // Generate initial stubs and initializes the entry points + + // entry points that exist in all platforms Note: This is code + // that could be shared among different 
platforms - however the + // benefit seems to be smaller than the disadvantage of having a + // much more complicated generator structure. See also comment in + // stubRoutines.hpp. + + StubRoutines::_forward_exception_entry = generate_forward_exception(); + + StubRoutines::_call_stub_entry = + generate_call_stub(StubRoutines::_call_stub_return_address); + + // is referenced by megamorphic call + StubRoutines::_catch_exception_entry = generate_catch_exception(); + + // Build this early so it's available for the interpreter. + StubRoutines::_throw_StackOverflowError_entry = + generate_throw_exception("StackOverflowError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime::throw_StackOverflowError)); + StubRoutines::_throw_delayed_StackOverflowError_entry = + generate_throw_exception("delayed StackOverflowError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime::throw_delayed_StackOverflowError)); + if (UseCRC32Intrinsics) { + // set table address before stub generation, which uses it + StubRoutines::_crc_table_adr = (address)StubRoutines::aarch32::_crc_table; + StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(false); + } + + if (UseCRC32CIntrinsics) { + // set table address before stub generation, which uses it + StubRoutines::_crc32c_table_addr = (address)StubRoutines::aarch32::_crc32c_table; + StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32(true); + } + + if (UseAESIntrinsics) { + // set table addresses before stub generation, which uses them + StubRoutines::_aes_table_te_addr = (address)StubRoutines::aarch32::_aes_te_table; + StubRoutines::_aes_table_td_addr = (address)StubRoutines::aarch32::_aes_td_table; + + StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); + StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); + + if (UseNeon) { + // AES CBC implementation uses NEON instructions + StubRoutines::_cipherBlockChaining_encryptAESCrypt_special = generate_cipherBlockChaining_encryptAESCrypt(false); + StubRoutines::_cipherBlockChaining_decryptAESCrypt_special = generate_cipherBlockChaining_decryptAESCrypt(false); + StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(true); + StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(true); + } + } + + if (UseSHA1Intrinsics) { + StubRoutines::_sha1_table_addr = (address)StubRoutines::aarch32::_sha1_table; + StubRoutines::_sha1_implCompress = generate_sha_implCompress(); + } + if (UseSHA256Intrinsics) { + StubRoutines::_sha256_table_addr = (address)StubRoutines::aarch32::_sha256_table; + StubRoutines::_sha256_implCompress = generate_sha256_implCompress(); + } + if (UseSHA512Intrinsics) { + StubRoutines::_sha512_table_addr = (address)StubRoutines::aarch32::_sha512_table; + StubRoutines::_sha512_implCompress = generate_sha512_implCompress(); + } + + NativeCall::init(); + } +#undef __ +#define __ _masm-> + +#ifdef COMPILER2 + address generate_idiv_irem_stub(const char *name, bool want_mod) { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", name); + + address start = __ pc(); + + BLOCK_COMMENT("Entry:"); + __ enter(); // required for proper stackwalking of RuntimeStub frame + // C2 knows this kills rscratch1 and rscratch2, so it does not save them + + __ divide(r0, r1, r2, 32, want_mod); + + __ leave(); // required for proper stackwalking of RuntimeStub frame + __ ret(lr); + + return start; + } + + // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub,
Klass super ); + // Arguments : + // + // ret : R0, returned + // icc/xcc: set as R0 (depending on wordSize) + // sub : R1, argument, not changed + // super: R2, argument, not changed + // raddr: LR, blown by call + address generate_partial_subtype_check() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "partial_subtype_check"); + address start = __ pc(); + + // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops) + + // R0 used as tmp_reg (in addition to return reg) + Register sub_klass = r1; + Register super_klass = r2; + Register tmp_reg2 = r3; + Register tmp_reg3 = r4; + +// inc_counter_np kills rscratch1 and rscratch2 +#define saved_set RegSet::of(tmp_reg2, tmp_reg3, rscratch1, rscratch2) + + Label L_loop, L_fail; + + int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); + + // fast check should be redundant + + // slow check + { + __ push(saved_set, sp); + + // a couple of useful fields in sub_klass: + int ss_offset = in_bytes(Klass::secondary_supers_offset()); + + // Do a linear scan of the secondary super-klass chain. + // This code is rarely used, so simplicity is a virtue here. + + inc_counter_np(SharedRuntime::_partial_subtype_ctr); + + Register scan_temp = tmp_reg2; + Register count_temp = tmp_reg3; + + // We will consult the secondary-super array. + __ ldr(scan_temp, Address(sub_klass, ss_offset)); + + Register search_key = super_klass; + + // Load the array length. + __ ldr(count_temp, Address(scan_temp, Array::length_offset_in_bytes())); + __ add(scan_temp, scan_temp, Array::base_offset_in_bytes()); + + __ add(count_temp, count_temp, 1); + + // Top of search loop + __ bind(L_loop); + // Notes: + // scan_temp starts at the array elements + // count_temp is 1+size + __ subs(count_temp, count_temp, 1); + __ b(L_fail, Assembler::EQ); // not found in the array + + // Load next super to check + // In the array of super classes elements are pointer sized. + int element_size = wordSize; + __ ldr(r0, __ post(scan_temp, element_size)); + + // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list + __ subs(r0, r0, search_key); // set R0 to 0 on success (and flags to eq) + + // A miss means we are NOT a subtype and need to keep looping + __ b(L_loop, Assembler::NE); + + // Falling out the bottom means we found a hit; we ARE a subtype + + // Success. Cache the super we found and proceed in triumph. 
+ __ str(super_klass, Address(sub_klass, sc_offset)); + + // Return success + // R0 is already 0 and flags are already set to eq + __ pop(saved_set, sp); + __ ret(lr); + + // Return failure + __ bind(L_fail); + __ movs_i(r0, 1); // sets the flags + __ pop(saved_set, sp); + __ ret(lr); + } + return start; + } +#undef saved_set + + address generate_string_compress_neon() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "string_compress_neon"); + address start = __ pc(); + + Register src = r2; + Register dst = r1; + Register len = r3; + Register t = r9; + Register t2 = r12; + FloatRegister a1 = d0; + FloatRegister a2 = d1; + FloatRegister b1 = d2; + FloatRegister b2 = d3; + Register result = r0; + + Label Lloop2, Lset_result; + + __ sub(len, len, 8+16); + __ vld1_64(a1, a2, __ post(src, 16), Assembler::ALIGN_STD); + __ bind(Lloop2); { + __ vld1_64(b1, __ post(src, 8), Assembler::ALIGN_STD); + __ vuzp_64_8(a1, a2); // a1 now has lower bytes, a2 upper + __ vld1_64(b2, __ post(src, 8), Assembler::ALIGN_STD); + __ vmov_f64(t, t2, a2); + __ vst1_64(a1, __ post(dst, 8), Assembler::ALIGN_STD); + __ orrs(t, t, t2); + __ b(Lset_result, Assembler::NE); + + __ vld1_64(a1, __ post(src, 8), Assembler::ALIGN_STD); + __ vuzp_64_8(b1, b2); // b1 now has lower bytes, b2 upper + __ vld1_64(a2, __ post(src, 8), Assembler::ALIGN_STD); + __ vmov_f64(t, t2, b2); + __ vst1_64(b1, __ post(dst, 8), Assembler::ALIGN_STD); + __ orrs(t, t, t2); + __ b(Lset_result, Assembler::NE); + __ subs(len, len, 16); + __ b(Lloop2, Assembler::GE); + } + + __ vuzp_64_8(a1, a2); // a1 now has lower bytes, a2 upper + __ vmov_f64(t, t2, a2); + __ vst1_64(a1, __ post(dst, 8), Assembler::ALIGN_STD); + __ orrs(t, t, t2); + __ b(Lset_result, Assembler::NE); + __ adds(len, len, 16); + __ ret(lr); // leaves Z-flag to check for per-char slow case + + __ bind(Lset_result); + __ movs_i(result, 0, Assembler::NE); // sets Z flag + __ ret(lr); + + return start; + } + + address generate_string_inflate_neon() { + __ align(CodeEntryAlignment); + StubCodeMark mark(this, "StubRoutines", "string_inflate_neon"); + address start = __ pc(); + + Register src = r0; + Register dst = r1; + Register len = r2; + FloatRegister a1 = d0; + + Label Lloop2; + + __ sub(len, len, 16); + __ bind(Lloop2); { + __ vld1_64(d0, __ post(src, 8), Assembler::ALIGN_STD); + __ vmovl_8u(q0, d0); + __ vst1_64(d0, d1, __ post(dst, 16), Assembler::ALIGN_STD); + __ vld1_64(d0, __ post(src, 8), Assembler::ALIGN_STD); + __ vmovl_8u(q0, d0); + __ vst1_64(d0, d1, __ post(dst, 16), Assembler::ALIGN_STD); + __ subs(len, len, 16); + __ b(Lloop2, Assembler::HS); + } + + __ adds(len, len, 16); // sets Z flag to check in intrinsic + __ ret(lr); + + return start; + } + + void generate_c2_stubs() { + StubRoutines::aarch32::_idiv_entry = + generate_idiv_irem_stub("idiv_c2_stub", false); + StubRoutines::aarch32::_irem_entry = + generate_idiv_irem_stub("irem_c2_stub", true); + StubRoutines::aarch32::_partial_subtype_check = + generate_partial_subtype_check(); + if (VM_Version::features() & FT_AdvSIMD) { + StubRoutines::aarch32::_string_compress_neon = + generate_string_compress_neon(); + StubRoutines::aarch32::_string_inflate_neon = + generate_string_inflate_neon(); + } + } +#endif + + void generate_all() { + // support for verify_oop (must happen after universe_init) + StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); + StubRoutines::_throw_AbstractMethodError_entry = + generate_throw_exception("AbstractMethodError throw_exception", + 
CAST_FROM_FN_PTR(address, + SharedRuntime:: + throw_AbstractMethodError)); + + StubRoutines::_throw_IncompatibleClassChangeError_entry = + generate_throw_exception("IncompatibleClassChangeError throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime:: + throw_IncompatibleClassChangeError)); + + StubRoutines::_throw_NullPointerException_at_call_entry = + generate_throw_exception("NullPointerException at call throw_exception", + CAST_FROM_FN_PTR(address, + SharedRuntime:: + throw_NullPointerException_at_call)); + + // arraycopy stubs used by compilers + generate_arraycopy_stubs(); + +#ifdef COMPILER2 + if (UseMultiplyToLenIntrinsic) { + StubRoutines::_multiplyToLen = generate_multiplyToLen(); + StubRoutines::_mulAdd = generate_mulAdd(); + } +#endif + + if (UseMontgomeryMultiplyIntrinsic) { + StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); + MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); + StubRoutines::_montgomeryMultiply = g.generate_multiply(); + } + + if (UseMontgomerySquareIntrinsic) { + StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); + MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); + StubRoutines::_montgomerySquare = g.generate_square(); + } + + // Safefetch stubs. + generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, + &StubRoutines::_safefetch32_fault_pc, + &StubRoutines::_safefetch32_continuation_pc); + generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, + &StubRoutines::_safefetchN_fault_pc, + &StubRoutines::_safefetchN_continuation_pc); + +#ifdef COMPILER2 + generate_c2_stubs(); +#endif + } + + public: + StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { + if (all) { + generate_all(); + } else { + generate_initial(); + } + + } +}; // end class declaration + +void StubGenerator_generate(CodeBuffer* code, bool all) { + StubGenerator g(code, all); +} --- /dev/null 2018-09-25 19:25:26.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/stubRoutines_aarch32.cpp 2018-09-25 19:25:26.000000000 +0300 @@ -0,0 +1,797 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "precompiled.hpp" +#include "runtime/deoptimization.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/thread.inline.hpp" +#include "utilities/globalDefinitions.hpp" + +// Implementation of the platform-specific part of StubRoutines - for +// a description of how to extend it, see the stubRoutines.hpp file. + +#ifdef COMPILER2 +address StubRoutines::aarch32::_idiv_entry = NULL; +address StubRoutines::aarch32::_irem_entry = NULL; +address StubRoutines::aarch32::_partial_subtype_check = NULL; +address StubRoutines::aarch32::_string_compress_neon = NULL; +address StubRoutines::aarch32::_string_inflate_neon = NULL; +#endif +/** + * crc_table[] from jdk/src/share/native/java/util/zip/zlib-1.2.5/crc32.h + */ +juint StubRoutines::aarch32::_crc_table[] + ATTRIBUTE_ALIGNED(4096) = +{ + // Table 0 + 0x00000000UL, 0x77073096UL, 0xee0e612cUL, 0x990951baUL, 0x076dc419UL, + 0x706af48fUL, 0xe963a535UL, 0x9e6495a3UL, 0x0edb8832UL, 0x79dcb8a4UL, + 0xe0d5e91eUL, 0x97d2d988UL, 0x09b64c2bUL, 0x7eb17cbdUL, 0xe7b82d07UL, + 0x90bf1d91UL, 0x1db71064UL, 0x6ab020f2UL, 0xf3b97148UL, 0x84be41deUL, + 0x1adad47dUL, 0x6ddde4ebUL, 0xf4d4b551UL, 0x83d385c7UL, 0x136c9856UL, + 0x646ba8c0UL, 0xfd62f97aUL, 0x8a65c9ecUL, 0x14015c4fUL, 0x63066cd9UL, + 0xfa0f3d63UL, 0x8d080df5UL, 0x3b6e20c8UL, 0x4c69105eUL, 0xd56041e4UL, + 0xa2677172UL, 0x3c03e4d1UL, 0x4b04d447UL, 0xd20d85fdUL, 0xa50ab56bUL, + 0x35b5a8faUL, 0x42b2986cUL, 0xdbbbc9d6UL, 0xacbcf940UL, 0x32d86ce3UL, + 0x45df5c75UL, 0xdcd60dcfUL, 0xabd13d59UL, 0x26d930acUL, 0x51de003aUL, + 0xc8d75180UL, 0xbfd06116UL, 0x21b4f4b5UL, 0x56b3c423UL, 0xcfba9599UL, + 0xb8bda50fUL, 0x2802b89eUL, 0x5f058808UL, 0xc60cd9b2UL, 0xb10be924UL, + 0x2f6f7c87UL, 0x58684c11UL, 0xc1611dabUL, 0xb6662d3dUL, 0x76dc4190UL, + 0x01db7106UL, 0x98d220bcUL, 0xefd5102aUL, 0x71b18589UL, 0x06b6b51fUL, + 0x9fbfe4a5UL, 0xe8b8d433UL, 0x7807c9a2UL, 0x0f00f934UL, 0x9609a88eUL, + 0xe10e9818UL, 0x7f6a0dbbUL, 0x086d3d2dUL, 0x91646c97UL, 0xe6635c01UL, + 0x6b6b51f4UL, 0x1c6c6162UL, 0x856530d8UL, 0xf262004eUL, 0x6c0695edUL, + 0x1b01a57bUL, 0x8208f4c1UL, 0xf50fc457UL, 0x65b0d9c6UL, 0x12b7e950UL, + 0x8bbeb8eaUL, 0xfcb9887cUL, 0x62dd1ddfUL, 0x15da2d49UL, 0x8cd37cf3UL, + 0xfbd44c65UL, 0x4db26158UL, 0x3ab551ceUL, 0xa3bc0074UL, 0xd4bb30e2UL, + 0x4adfa541UL, 0x3dd895d7UL, 0xa4d1c46dUL, 0xd3d6f4fbUL, 0x4369e96aUL, + 0x346ed9fcUL, 0xad678846UL, 0xda60b8d0UL, 0x44042d73UL, 0x33031de5UL, + 0xaa0a4c5fUL, 0xdd0d7cc9UL, 0x5005713cUL, 0x270241aaUL, 0xbe0b1010UL, + 0xc90c2086UL, 0x5768b525UL, 0x206f85b3UL, 0xb966d409UL, 0xce61e49fUL, + 0x5edef90eUL, 0x29d9c998UL, 0xb0d09822UL, 0xc7d7a8b4UL, 0x59b33d17UL, + 0x2eb40d81UL, 0xb7bd5c3bUL, 0xc0ba6cadUL, 0xedb88320UL, 0x9abfb3b6UL, + 0x03b6e20cUL, 0x74b1d29aUL, 0xead54739UL, 0x9dd277afUL, 0x04db2615UL, + 0x73dc1683UL, 0xe3630b12UL, 0x94643b84UL, 0x0d6d6a3eUL, 0x7a6a5aa8UL, + 0xe40ecf0bUL, 0x9309ff9dUL, 0x0a00ae27UL, 0x7d079eb1UL, 0xf00f9344UL, + 0x8708a3d2UL, 0x1e01f268UL, 0x6906c2feUL, 0xf762575dUL, 0x806567cbUL, + 0x196c3671UL, 0x6e6b06e7UL, 0xfed41b76UL, 0x89d32be0UL, 0x10da7a5aUL, + 0x67dd4accUL, 0xf9b9df6fUL, 0x8ebeeff9UL, 0x17b7be43UL, 0x60b08ed5UL, + 0xd6d6a3e8UL, 0xa1d1937eUL, 0x38d8c2c4UL, 0x4fdff252UL, 0xd1bb67f1UL, + 0xa6bc5767UL, 0x3fb506ddUL, 0x48b2364bUL, 0xd80d2bdaUL, 0xaf0a1b4cUL, + 0x36034af6UL, 0x41047a60UL, 0xdf60efc3UL, 0xa867df55UL, 0x316e8eefUL, + 0x4669be79UL, 0xcb61b38cUL, 0xbc66831aUL, 0x256fd2a0UL, 0x5268e236UL, + 0xcc0c7795UL, 0xbb0b4703UL, 0x220216b9UL, 0x5505262fUL, 0xc5ba3bbeUL, + 0xb2bd0b28UL, 
0x2bb45a92UL, 0x5cb36a04UL, 0xc2d7ffa7UL, 0xb5d0cf31UL, + 0x2cd99e8bUL, 0x5bdeae1dUL, 0x9b64c2b0UL, 0xec63f226UL, 0x756aa39cUL, + 0x026d930aUL, 0x9c0906a9UL, 0xeb0e363fUL, 0x72076785UL, 0x05005713UL, + 0x95bf4a82UL, 0xe2b87a14UL, 0x7bb12baeUL, 0x0cb61b38UL, 0x92d28e9bUL, + 0xe5d5be0dUL, 0x7cdcefb7UL, 0x0bdbdf21UL, 0x86d3d2d4UL, 0xf1d4e242UL, + 0x68ddb3f8UL, 0x1fda836eUL, 0x81be16cdUL, 0xf6b9265bUL, 0x6fb077e1UL, + 0x18b74777UL, 0x88085ae6UL, 0xff0f6a70UL, 0x66063bcaUL, 0x11010b5cUL, + 0x8f659effUL, 0xf862ae69UL, 0x616bffd3UL, 0x166ccf45UL, 0xa00ae278UL, + 0xd70dd2eeUL, 0x4e048354UL, 0x3903b3c2UL, 0xa7672661UL, 0xd06016f7UL, + 0x4969474dUL, 0x3e6e77dbUL, 0xaed16a4aUL, 0xd9d65adcUL, 0x40df0b66UL, + 0x37d83bf0UL, 0xa9bcae53UL, 0xdebb9ec5UL, 0x47b2cf7fUL, 0x30b5ffe9UL, + 0xbdbdf21cUL, 0xcabac28aUL, 0x53b39330UL, 0x24b4a3a6UL, 0xbad03605UL, + 0xcdd70693UL, 0x54de5729UL, 0x23d967bfUL, 0xb3667a2eUL, 0xc4614ab8UL, + 0x5d681b02UL, 0x2a6f2b94UL, 0xb40bbe37UL, 0xc30c8ea1UL, 0x5a05df1bUL, + 0x2d02ef8dUL, + + // Table 1 + 0x00000000UL, 0x191b3141UL, 0x32366282UL, 0x2b2d53c3UL, 0x646cc504UL, + 0x7d77f445UL, 0x565aa786UL, 0x4f4196c7UL, 0xc8d98a08UL, 0xd1c2bb49UL, + 0xfaefe88aUL, 0xe3f4d9cbUL, 0xacb54f0cUL, 0xb5ae7e4dUL, 0x9e832d8eUL, + 0x87981ccfUL, 0x4ac21251UL, 0x53d92310UL, 0x78f470d3UL, 0x61ef4192UL, + 0x2eaed755UL, 0x37b5e614UL, 0x1c98b5d7UL, 0x05838496UL, 0x821b9859UL, + 0x9b00a918UL, 0xb02dfadbUL, 0xa936cb9aUL, 0xe6775d5dUL, 0xff6c6c1cUL, + 0xd4413fdfUL, 0xcd5a0e9eUL, 0x958424a2UL, 0x8c9f15e3UL, 0xa7b24620UL, + 0xbea97761UL, 0xf1e8e1a6UL, 0xe8f3d0e7UL, 0xc3de8324UL, 0xdac5b265UL, + 0x5d5daeaaUL, 0x44469febUL, 0x6f6bcc28UL, 0x7670fd69UL, 0x39316baeUL, + 0x202a5aefUL, 0x0b07092cUL, 0x121c386dUL, 0xdf4636f3UL, 0xc65d07b2UL, + 0xed705471UL, 0xf46b6530UL, 0xbb2af3f7UL, 0xa231c2b6UL, 0x891c9175UL, + 0x9007a034UL, 0x179fbcfbUL, 0x0e848dbaUL, 0x25a9de79UL, 0x3cb2ef38UL, + 0x73f379ffUL, 0x6ae848beUL, 0x41c51b7dUL, 0x58de2a3cUL, 0xf0794f05UL, + 0xe9627e44UL, 0xc24f2d87UL, 0xdb541cc6UL, 0x94158a01UL, 0x8d0ebb40UL, + 0xa623e883UL, 0xbf38d9c2UL, 0x38a0c50dUL, 0x21bbf44cUL, 0x0a96a78fUL, + 0x138d96ceUL, 0x5ccc0009UL, 0x45d73148UL, 0x6efa628bUL, 0x77e153caUL, + 0xbabb5d54UL, 0xa3a06c15UL, 0x888d3fd6UL, 0x91960e97UL, 0xded79850UL, + 0xc7cca911UL, 0xece1fad2UL, 0xf5facb93UL, 0x7262d75cUL, 0x6b79e61dUL, + 0x4054b5deUL, 0x594f849fUL, 0x160e1258UL, 0x0f152319UL, 0x243870daUL, + 0x3d23419bUL, 0x65fd6ba7UL, 0x7ce65ae6UL, 0x57cb0925UL, 0x4ed03864UL, + 0x0191aea3UL, 0x188a9fe2UL, 0x33a7cc21UL, 0x2abcfd60UL, 0xad24e1afUL, + 0xb43fd0eeUL, 0x9f12832dUL, 0x8609b26cUL, 0xc94824abUL, 0xd05315eaUL, + 0xfb7e4629UL, 0xe2657768UL, 0x2f3f79f6UL, 0x362448b7UL, 0x1d091b74UL, + 0x04122a35UL, 0x4b53bcf2UL, 0x52488db3UL, 0x7965de70UL, 0x607eef31UL, + 0xe7e6f3feUL, 0xfefdc2bfUL, 0xd5d0917cUL, 0xcccba03dUL, 0x838a36faUL, + 0x9a9107bbUL, 0xb1bc5478UL, 0xa8a76539UL, 0x3b83984bUL, 0x2298a90aUL, + 0x09b5fac9UL, 0x10aecb88UL, 0x5fef5d4fUL, 0x46f46c0eUL, 0x6dd93fcdUL, + 0x74c20e8cUL, 0xf35a1243UL, 0xea412302UL, 0xc16c70c1UL, 0xd8774180UL, + 0x9736d747UL, 0x8e2de606UL, 0xa500b5c5UL, 0xbc1b8484UL, 0x71418a1aUL, + 0x685abb5bUL, 0x4377e898UL, 0x5a6cd9d9UL, 0x152d4f1eUL, 0x0c367e5fUL, + 0x271b2d9cUL, 0x3e001cddUL, 0xb9980012UL, 0xa0833153UL, 0x8bae6290UL, + 0x92b553d1UL, 0xddf4c516UL, 0xc4eff457UL, 0xefc2a794UL, 0xf6d996d5UL, + 0xae07bce9UL, 0xb71c8da8UL, 0x9c31de6bUL, 0x852aef2aUL, 0xca6b79edUL, + 0xd37048acUL, 0xf85d1b6fUL, 0xe1462a2eUL, 0x66de36e1UL, 0x7fc507a0UL, + 0x54e85463UL, 0x4df36522UL, 0x02b2f3e5UL, 0x1ba9c2a4UL, 0x30849167UL, + 
0x299fa026UL, 0xe4c5aeb8UL, 0xfdde9ff9UL, 0xd6f3cc3aUL, 0xcfe8fd7bUL, + 0x80a96bbcUL, 0x99b25afdUL, 0xb29f093eUL, 0xab84387fUL, 0x2c1c24b0UL, + 0x350715f1UL, 0x1e2a4632UL, 0x07317773UL, 0x4870e1b4UL, 0x516bd0f5UL, + 0x7a468336UL, 0x635db277UL, 0xcbfad74eUL, 0xd2e1e60fUL, 0xf9ccb5ccUL, + 0xe0d7848dUL, 0xaf96124aUL, 0xb68d230bUL, 0x9da070c8UL, 0x84bb4189UL, + 0x03235d46UL, 0x1a386c07UL, 0x31153fc4UL, 0x280e0e85UL, 0x674f9842UL, + 0x7e54a903UL, 0x5579fac0UL, 0x4c62cb81UL, 0x8138c51fUL, 0x9823f45eUL, + 0xb30ea79dUL, 0xaa1596dcUL, 0xe554001bUL, 0xfc4f315aUL, 0xd7626299UL, + 0xce7953d8UL, 0x49e14f17UL, 0x50fa7e56UL, 0x7bd72d95UL, 0x62cc1cd4UL, + 0x2d8d8a13UL, 0x3496bb52UL, 0x1fbbe891UL, 0x06a0d9d0UL, 0x5e7ef3ecUL, + 0x4765c2adUL, 0x6c48916eUL, 0x7553a02fUL, 0x3a1236e8UL, 0x230907a9UL, + 0x0824546aUL, 0x113f652bUL, 0x96a779e4UL, 0x8fbc48a5UL, 0xa4911b66UL, + 0xbd8a2a27UL, 0xf2cbbce0UL, 0xebd08da1UL, 0xc0fdde62UL, 0xd9e6ef23UL, + 0x14bce1bdUL, 0x0da7d0fcUL, 0x268a833fUL, 0x3f91b27eUL, 0x70d024b9UL, + 0x69cb15f8UL, 0x42e6463bUL, 0x5bfd777aUL, 0xdc656bb5UL, 0xc57e5af4UL, + 0xee530937UL, 0xf7483876UL, 0xb809aeb1UL, 0xa1129ff0UL, 0x8a3fcc33UL, + 0x9324fd72UL, + + // Table 2 + 0x00000000UL, 0x01c26a37UL, 0x0384d46eUL, 0x0246be59UL, 0x0709a8dcUL, + 0x06cbc2ebUL, 0x048d7cb2UL, 0x054f1685UL, 0x0e1351b8UL, 0x0fd13b8fUL, + 0x0d9785d6UL, 0x0c55efe1UL, 0x091af964UL, 0x08d89353UL, 0x0a9e2d0aUL, + 0x0b5c473dUL, 0x1c26a370UL, 0x1de4c947UL, 0x1fa2771eUL, 0x1e601d29UL, + 0x1b2f0bacUL, 0x1aed619bUL, 0x18abdfc2UL, 0x1969b5f5UL, 0x1235f2c8UL, + 0x13f798ffUL, 0x11b126a6UL, 0x10734c91UL, 0x153c5a14UL, 0x14fe3023UL, + 0x16b88e7aUL, 0x177ae44dUL, 0x384d46e0UL, 0x398f2cd7UL, 0x3bc9928eUL, + 0x3a0bf8b9UL, 0x3f44ee3cUL, 0x3e86840bUL, 0x3cc03a52UL, 0x3d025065UL, + 0x365e1758UL, 0x379c7d6fUL, 0x35dac336UL, 0x3418a901UL, 0x3157bf84UL, + 0x3095d5b3UL, 0x32d36beaUL, 0x331101ddUL, 0x246be590UL, 0x25a98fa7UL, + 0x27ef31feUL, 0x262d5bc9UL, 0x23624d4cUL, 0x22a0277bUL, 0x20e69922UL, + 0x2124f315UL, 0x2a78b428UL, 0x2bbade1fUL, 0x29fc6046UL, 0x283e0a71UL, + 0x2d711cf4UL, 0x2cb376c3UL, 0x2ef5c89aUL, 0x2f37a2adUL, 0x709a8dc0UL, + 0x7158e7f7UL, 0x731e59aeUL, 0x72dc3399UL, 0x7793251cUL, 0x76514f2bUL, + 0x7417f172UL, 0x75d59b45UL, 0x7e89dc78UL, 0x7f4bb64fUL, 0x7d0d0816UL, + 0x7ccf6221UL, 0x798074a4UL, 0x78421e93UL, 0x7a04a0caUL, 0x7bc6cafdUL, + 0x6cbc2eb0UL, 0x6d7e4487UL, 0x6f38fadeUL, 0x6efa90e9UL, 0x6bb5866cUL, + 0x6a77ec5bUL, 0x68315202UL, 0x69f33835UL, 0x62af7f08UL, 0x636d153fUL, + 0x612bab66UL, 0x60e9c151UL, 0x65a6d7d4UL, 0x6464bde3UL, 0x662203baUL, + 0x67e0698dUL, 0x48d7cb20UL, 0x4915a117UL, 0x4b531f4eUL, 0x4a917579UL, + 0x4fde63fcUL, 0x4e1c09cbUL, 0x4c5ab792UL, 0x4d98dda5UL, 0x46c49a98UL, + 0x4706f0afUL, 0x45404ef6UL, 0x448224c1UL, 0x41cd3244UL, 0x400f5873UL, + 0x4249e62aUL, 0x438b8c1dUL, 0x54f16850UL, 0x55330267UL, 0x5775bc3eUL, + 0x56b7d609UL, 0x53f8c08cUL, 0x523aaabbUL, 0x507c14e2UL, 0x51be7ed5UL, + 0x5ae239e8UL, 0x5b2053dfUL, 0x5966ed86UL, 0x58a487b1UL, 0x5deb9134UL, + 0x5c29fb03UL, 0x5e6f455aUL, 0x5fad2f6dUL, 0xe1351b80UL, 0xe0f771b7UL, + 0xe2b1cfeeUL, 0xe373a5d9UL, 0xe63cb35cUL, 0xe7fed96bUL, 0xe5b86732UL, + 0xe47a0d05UL, 0xef264a38UL, 0xeee4200fUL, 0xeca29e56UL, 0xed60f461UL, + 0xe82fe2e4UL, 0xe9ed88d3UL, 0xebab368aUL, 0xea695cbdUL, 0xfd13b8f0UL, + 0xfcd1d2c7UL, 0xfe976c9eUL, 0xff5506a9UL, 0xfa1a102cUL, 0xfbd87a1bUL, + 0xf99ec442UL, 0xf85cae75UL, 0xf300e948UL, 0xf2c2837fUL, 0xf0843d26UL, + 0xf1465711UL, 0xf4094194UL, 0xf5cb2ba3UL, 0xf78d95faUL, 0xf64fffcdUL, + 0xd9785d60UL, 0xd8ba3757UL, 0xdafc890eUL, 0xdb3ee339UL, 
0xde71f5bcUL, + 0xdfb39f8bUL, 0xddf521d2UL, 0xdc374be5UL, 0xd76b0cd8UL, 0xd6a966efUL, + 0xd4efd8b6UL, 0xd52db281UL, 0xd062a404UL, 0xd1a0ce33UL, 0xd3e6706aUL, + 0xd2241a5dUL, 0xc55efe10UL, 0xc49c9427UL, 0xc6da2a7eUL, 0xc7184049UL, + 0xc25756ccUL, 0xc3953cfbUL, 0xc1d382a2UL, 0xc011e895UL, 0xcb4dafa8UL, + 0xca8fc59fUL, 0xc8c97bc6UL, 0xc90b11f1UL, 0xcc440774UL, 0xcd866d43UL, + 0xcfc0d31aUL, 0xce02b92dUL, 0x91af9640UL, 0x906dfc77UL, 0x922b422eUL, + 0x93e92819UL, 0x96a63e9cUL, 0x976454abUL, 0x9522eaf2UL, 0x94e080c5UL, + 0x9fbcc7f8UL, 0x9e7eadcfUL, 0x9c381396UL, 0x9dfa79a1UL, 0x98b56f24UL, + 0x99770513UL, 0x9b31bb4aUL, 0x9af3d17dUL, 0x8d893530UL, 0x8c4b5f07UL, + 0x8e0de15eUL, 0x8fcf8b69UL, 0x8a809decUL, 0x8b42f7dbUL, 0x89044982UL, + 0x88c623b5UL, 0x839a6488UL, 0x82580ebfUL, 0x801eb0e6UL, 0x81dcdad1UL, + 0x8493cc54UL, 0x8551a663UL, 0x8717183aUL, 0x86d5720dUL, 0xa9e2d0a0UL, + 0xa820ba97UL, 0xaa6604ceUL, 0xaba46ef9UL, 0xaeeb787cUL, 0xaf29124bUL, + 0xad6fac12UL, 0xacadc625UL, 0xa7f18118UL, 0xa633eb2fUL, 0xa4755576UL, + 0xa5b73f41UL, 0xa0f829c4UL, 0xa13a43f3UL, 0xa37cfdaaUL, 0xa2be979dUL, + 0xb5c473d0UL, 0xb40619e7UL, 0xb640a7beUL, 0xb782cd89UL, 0xb2cddb0cUL, + 0xb30fb13bUL, 0xb1490f62UL, 0xb08b6555UL, 0xbbd72268UL, 0xba15485fUL, + 0xb853f606UL, 0xb9919c31UL, 0xbcde8ab4UL, 0xbd1ce083UL, 0xbf5a5edaUL, + 0xbe9834edUL, + + // Table 3 + 0x00000000UL, 0xb8bc6765UL, 0xaa09c88bUL, 0x12b5afeeUL, 0x8f629757UL, + 0x37def032UL, 0x256b5fdcUL, 0x9dd738b9UL, 0xc5b428efUL, 0x7d084f8aUL, + 0x6fbde064UL, 0xd7018701UL, 0x4ad6bfb8UL, 0xf26ad8ddUL, 0xe0df7733UL, + 0x58631056UL, 0x5019579fUL, 0xe8a530faUL, 0xfa109f14UL, 0x42acf871UL, + 0xdf7bc0c8UL, 0x67c7a7adUL, 0x75720843UL, 0xcdce6f26UL, 0x95ad7f70UL, + 0x2d111815UL, 0x3fa4b7fbUL, 0x8718d09eUL, 0x1acfe827UL, 0xa2738f42UL, + 0xb0c620acUL, 0x087a47c9UL, 0xa032af3eUL, 0x188ec85bUL, 0x0a3b67b5UL, + 0xb28700d0UL, 0x2f503869UL, 0x97ec5f0cUL, 0x8559f0e2UL, 0x3de59787UL, + 0x658687d1UL, 0xdd3ae0b4UL, 0xcf8f4f5aUL, 0x7733283fUL, 0xeae41086UL, + 0x525877e3UL, 0x40edd80dUL, 0xf851bf68UL, 0xf02bf8a1UL, 0x48979fc4UL, + 0x5a22302aUL, 0xe29e574fUL, 0x7f496ff6UL, 0xc7f50893UL, 0xd540a77dUL, + 0x6dfcc018UL, 0x359fd04eUL, 0x8d23b72bUL, 0x9f9618c5UL, 0x272a7fa0UL, + 0xbafd4719UL, 0x0241207cUL, 0x10f48f92UL, 0xa848e8f7UL, 0x9b14583dUL, + 0x23a83f58UL, 0x311d90b6UL, 0x89a1f7d3UL, 0x1476cf6aUL, 0xaccaa80fUL, + 0xbe7f07e1UL, 0x06c36084UL, 0x5ea070d2UL, 0xe61c17b7UL, 0xf4a9b859UL, + 0x4c15df3cUL, 0xd1c2e785UL, 0x697e80e0UL, 0x7bcb2f0eUL, 0xc377486bUL, + 0xcb0d0fa2UL, 0x73b168c7UL, 0x6104c729UL, 0xd9b8a04cUL, 0x446f98f5UL, + 0xfcd3ff90UL, 0xee66507eUL, 0x56da371bUL, 0x0eb9274dUL, 0xb6054028UL, + 0xa4b0efc6UL, 0x1c0c88a3UL, 0x81dbb01aUL, 0x3967d77fUL, 0x2bd27891UL, + 0x936e1ff4UL, 0x3b26f703UL, 0x839a9066UL, 0x912f3f88UL, 0x299358edUL, + 0xb4446054UL, 0x0cf80731UL, 0x1e4da8dfUL, 0xa6f1cfbaUL, 0xfe92dfecUL, + 0x462eb889UL, 0x549b1767UL, 0xec277002UL, 0x71f048bbUL, 0xc94c2fdeUL, + 0xdbf98030UL, 0x6345e755UL, 0x6b3fa09cUL, 0xd383c7f9UL, 0xc1366817UL, + 0x798a0f72UL, 0xe45d37cbUL, 0x5ce150aeUL, 0x4e54ff40UL, 0xf6e89825UL, + 0xae8b8873UL, 0x1637ef16UL, 0x048240f8UL, 0xbc3e279dUL, 0x21e91f24UL, + 0x99557841UL, 0x8be0d7afUL, 0x335cb0caUL, 0xed59b63bUL, 0x55e5d15eUL, + 0x47507eb0UL, 0xffec19d5UL, 0x623b216cUL, 0xda874609UL, 0xc832e9e7UL, + 0x708e8e82UL, 0x28ed9ed4UL, 0x9051f9b1UL, 0x82e4565fUL, 0x3a58313aUL, + 0xa78f0983UL, 0x1f336ee6UL, 0x0d86c108UL, 0xb53aa66dUL, 0xbd40e1a4UL, + 0x05fc86c1UL, 0x1749292fUL, 0xaff54e4aUL, 0x322276f3UL, 0x8a9e1196UL, + 0x982bbe78UL, 0x2097d91dUL, 0x78f4c94bUL, 
0xc048ae2eUL, 0xd2fd01c0UL, + 0x6a4166a5UL, 0xf7965e1cUL, 0x4f2a3979UL, 0x5d9f9697UL, 0xe523f1f2UL, + 0x4d6b1905UL, 0xf5d77e60UL, 0xe762d18eUL, 0x5fdeb6ebUL, 0xc2098e52UL, + 0x7ab5e937UL, 0x680046d9UL, 0xd0bc21bcUL, 0x88df31eaUL, 0x3063568fUL, + 0x22d6f961UL, 0x9a6a9e04UL, 0x07bda6bdUL, 0xbf01c1d8UL, 0xadb46e36UL, + 0x15080953UL, 0x1d724e9aUL, 0xa5ce29ffUL, 0xb77b8611UL, 0x0fc7e174UL, + 0x9210d9cdUL, 0x2aacbea8UL, 0x38191146UL, 0x80a57623UL, 0xd8c66675UL, + 0x607a0110UL, 0x72cfaefeUL, 0xca73c99bUL, 0x57a4f122UL, 0xef189647UL, + 0xfdad39a9UL, 0x45115eccUL, 0x764dee06UL, 0xcef18963UL, 0xdc44268dUL, + 0x64f841e8UL, 0xf92f7951UL, 0x41931e34UL, 0x5326b1daUL, 0xeb9ad6bfUL, + 0xb3f9c6e9UL, 0x0b45a18cUL, 0x19f00e62UL, 0xa14c6907UL, 0x3c9b51beUL, + 0x842736dbUL, 0x96929935UL, 0x2e2efe50UL, 0x2654b999UL, 0x9ee8defcUL, + 0x8c5d7112UL, 0x34e11677UL, 0xa9362eceUL, 0x118a49abUL, 0x033fe645UL, + 0xbb838120UL, 0xe3e09176UL, 0x5b5cf613UL, 0x49e959fdUL, 0xf1553e98UL, + 0x6c820621UL, 0xd43e6144UL, 0xc68bceaaUL, 0x7e37a9cfUL, 0xd67f4138UL, + 0x6ec3265dUL, 0x7c7689b3UL, 0xc4caeed6UL, 0x591dd66fUL, 0xe1a1b10aUL, + 0xf3141ee4UL, 0x4ba87981UL, 0x13cb69d7UL, 0xab770eb2UL, 0xb9c2a15cUL, + 0x017ec639UL, 0x9ca9fe80UL, 0x241599e5UL, 0x36a0360bUL, 0x8e1c516eUL, + 0x866616a7UL, 0x3eda71c2UL, 0x2c6fde2cUL, 0x94d3b949UL, 0x090481f0UL, + 0xb1b8e695UL, 0xa30d497bUL, 0x1bb12e1eUL, 0x43d23e48UL, 0xfb6e592dUL, + 0xe9dbf6c3UL, 0x516791a6UL, 0xccb0a91fUL, 0x740cce7aUL, 0x66b96194UL, + 0xde0506f1UL, + // Constants for Neon CRC32 implementation, 128-bit operation + // k3 = 0xba8ccbe8 = x^160 mod poly - bit reversed + // k4 = 0xa06a2517 = x^128 mod poly - bit reversed + // poly = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 + 0 + 0xba8ccbe8, 0xa06a2517, // k4:k3 + 0x8cbae8cb, 0x6aa01725, // byte swap + 0xcbe8ba8c, 0x2517a06a, // word swap + 0xe8cb8cba, 0x17256aa0, // byte swap of word swap +}; + +/** + * CRC32C constants lookup table + */ + +juint StubRoutines::aarch32::_crc32c_table[] = +{ + // Table 0 + 0x00000000UL, 0xf26b8303UL, 0xe13b70f7UL, 0x1350f3f4UL, 0xc79a971fUL, + 0x35f1141cUL, 0x26a1e7e8UL, 0xd4ca64ebUL, 0x8ad958cfUL, 0x78b2dbccUL, + 0x6be22838UL, 0x9989ab3bUL, 0x4d43cfd0UL, 0xbf284cd3UL, 0xac78bf27UL, + 0x5e133c24UL, 0x105ec76fUL, 0xe235446cUL, 0xf165b798UL, 0x030e349bUL, + 0xd7c45070UL, 0x25afd373UL, 0x36ff2087UL, 0xc494a384UL, 0x9a879fa0UL, + 0x68ec1ca3UL, 0x7bbcef57UL, 0x89d76c54UL, 0x5d1d08bfUL, 0xaf768bbcUL, + 0xbc267848UL, 0x4e4dfb4bUL, 0x20bd8edeUL, 0xd2d60dddUL, 0xc186fe29UL, + 0x33ed7d2aUL, 0xe72719c1UL, 0x154c9ac2UL, 0x061c6936UL, 0xf477ea35UL, + 0xaa64d611UL, 0x580f5512UL, 0x4b5fa6e6UL, 0xb93425e5UL, 0x6dfe410eUL, + 0x9f95c20dUL, 0x8cc531f9UL, 0x7eaeb2faUL, 0x30e349b1UL, 0xc288cab2UL, + 0xd1d83946UL, 0x23b3ba45UL, 0xf779deaeUL, 0x05125dadUL, 0x1642ae59UL, + 0xe4292d5aUL, 0xba3a117eUL, 0x4851927dUL, 0x5b016189UL, 0xa96ae28aUL, + 0x7da08661UL, 0x8fcb0562UL, 0x9c9bf696UL, 0x6ef07595UL, 0x417b1dbcUL, + 0xb3109ebfUL, 0xa0406d4bUL, 0x522bee48UL, 0x86e18aa3UL, 0x748a09a0UL, + 0x67dafa54UL, 0x95b17957UL, 0xcba24573UL, 0x39c9c670UL, 0x2a993584UL, + 0xd8f2b687UL, 0x0c38d26cUL, 0xfe53516fUL, 0xed03a29bUL, 0x1f682198UL, + 0x5125dad3UL, 0xa34e59d0UL, 0xb01eaa24UL, 0x42752927UL, 0x96bf4dccUL, + 0x64d4cecfUL, 0x77843d3bUL, 0x85efbe38UL, 0xdbfc821cUL, 0x2997011fUL, + 0x3ac7f2ebUL, 0xc8ac71e8UL, 0x1c661503UL, 0xee0d9600UL, 0xfd5d65f4UL, + 0x0f36e6f7UL, 0x61c69362UL, 0x93ad1061UL, 0x80fde395UL, 0x72966096UL, + 0xa65c047dUL, 0x5437877eUL, 0x4767748aUL, 0xb50cf789UL, 
0xeb1fcbadUL, + 0x197448aeUL, 0x0a24bb5aUL, 0xf84f3859UL, 0x2c855cb2UL, 0xdeeedfb1UL, + 0xcdbe2c45UL, 0x3fd5af46UL, 0x7198540dUL, 0x83f3d70eUL, 0x90a324faUL, + 0x62c8a7f9UL, 0xb602c312UL, 0x44694011UL, 0x5739b3e5UL, 0xa55230e6UL, + 0xfb410cc2UL, 0x092a8fc1UL, 0x1a7a7c35UL, 0xe811ff36UL, 0x3cdb9bddUL, + 0xceb018deUL, 0xdde0eb2aUL, 0x2f8b6829UL, 0x82f63b78UL, 0x709db87bUL, + 0x63cd4b8fUL, 0x91a6c88cUL, 0x456cac67UL, 0xb7072f64UL, 0xa457dc90UL, + 0x563c5f93UL, 0x082f63b7UL, 0xfa44e0b4UL, 0xe9141340UL, 0x1b7f9043UL, + 0xcfb5f4a8UL, 0x3dde77abUL, 0x2e8e845fUL, 0xdce5075cUL, 0x92a8fc17UL, + 0x60c37f14UL, 0x73938ce0UL, 0x81f80fe3UL, 0x55326b08UL, 0xa759e80bUL, + 0xb4091bffUL, 0x466298fcUL, 0x1871a4d8UL, 0xea1a27dbUL, 0xf94ad42fUL, + 0x0b21572cUL, 0xdfeb33c7UL, 0x2d80b0c4UL, 0x3ed04330UL, 0xccbbc033UL, + 0xa24bb5a6UL, 0x502036a5UL, 0x4370c551UL, 0xb11b4652UL, 0x65d122b9UL, + 0x97baa1baUL, 0x84ea524eUL, 0x7681d14dUL, 0x2892ed69UL, 0xdaf96e6aUL, + 0xc9a99d9eUL, 0x3bc21e9dUL, 0xef087a76UL, 0x1d63f975UL, 0x0e330a81UL, + 0xfc588982UL, 0xb21572c9UL, 0x407ef1caUL, 0x532e023eUL, 0xa145813dUL, + 0x758fe5d6UL, 0x87e466d5UL, 0x94b49521UL, 0x66df1622UL, 0x38cc2a06UL, + 0xcaa7a905UL, 0xd9f75af1UL, 0x2b9cd9f2UL, 0xff56bd19UL, 0x0d3d3e1aUL, + 0x1e6dcdeeUL, 0xec064eedUL, 0xc38d26c4UL, 0x31e6a5c7UL, 0x22b65633UL, + 0xd0ddd530UL, 0x0417b1dbUL, 0xf67c32d8UL, 0xe52cc12cUL, 0x1747422fUL, + 0x49547e0bUL, 0xbb3ffd08UL, 0xa86f0efcUL, 0x5a048dffUL, 0x8ecee914UL, + 0x7ca56a17UL, 0x6ff599e3UL, 0x9d9e1ae0UL, 0xd3d3e1abUL, 0x21b862a8UL, + 0x32e8915cUL, 0xc083125fUL, 0x144976b4UL, 0xe622f5b7UL, 0xf5720643UL, + 0x07198540UL, 0x590ab964UL, 0xab613a67UL, 0xb831c993UL, 0x4a5a4a90UL, + 0x9e902e7bUL, 0x6cfbad78UL, 0x7fab5e8cUL, 0x8dc0dd8fUL, 0xe330a81aUL, + 0x115b2b19UL, 0x020bd8edUL, 0xf0605beeUL, 0x24aa3f05UL, 0xd6c1bc06UL, + 0xc5914ff2UL, 0x37faccf1UL, 0x69e9f0d5UL, 0x9b8273d6UL, 0x88d28022UL, + 0x7ab90321UL, 0xae7367caUL, 0x5c18e4c9UL, 0x4f48173dUL, 0xbd23943eUL, + 0xf36e6f75UL, 0x0105ec76UL, 0x12551f82UL, 0xe03e9c81UL, 0x34f4f86aUL, + 0xc69f7b69UL, 0xd5cf889dUL, 0x27a40b9eUL, 0x79b737baUL, 0x8bdcb4b9UL, + 0x988c474dUL, 0x6ae7c44eUL, 0xbe2da0a5UL, 0x4c4623a6UL, 0x5f16d052UL, + 0xad7d5351UL, + + // Table 1 + 0x00000000UL, 0x13a29877UL, 0x274530eeUL, 0x34e7a899UL, 0x4e8a61dcUL, + 0x5d28f9abUL, 0x69cf5132UL, 0x7a6dc945UL, 0x9d14c3b8UL, 0x8eb65bcfUL, + 0xba51f356UL, 0xa9f36b21UL, 0xd39ea264UL, 0xc03c3a13UL, 0xf4db928aUL, + 0xe7790afdUL, 0x3fc5f181UL, 0x2c6769f6UL, 0x1880c16fUL, 0x0b225918UL, + 0x714f905dUL, 0x62ed082aUL, 0x560aa0b3UL, 0x45a838c4UL, 0xa2d13239UL, + 0xb173aa4eUL, 0x859402d7UL, 0x96369aa0UL, 0xec5b53e5UL, 0xfff9cb92UL, + 0xcb1e630bUL, 0xd8bcfb7cUL, 0x7f8be302UL, 0x6c297b75UL, 0x58ced3ecUL, + 0x4b6c4b9bUL, 0x310182deUL, 0x22a31aa9UL, 0x1644b230UL, 0x05e62a47UL, + 0xe29f20baUL, 0xf13db8cdUL, 0xc5da1054UL, 0xd6788823UL, 0xac154166UL, + 0xbfb7d911UL, 0x8b507188UL, 0x98f2e9ffUL, 0x404e1283UL, 0x53ec8af4UL, + 0x670b226dUL, 0x74a9ba1aUL, 0x0ec4735fUL, 0x1d66eb28UL, 0x298143b1UL, + 0x3a23dbc6UL, 0xdd5ad13bUL, 0xcef8494cUL, 0xfa1fe1d5UL, 0xe9bd79a2UL, + 0x93d0b0e7UL, 0x80722890UL, 0xb4958009UL, 0xa737187eUL, 0xff17c604UL, + 0xecb55e73UL, 0xd852f6eaUL, 0xcbf06e9dUL, 0xb19da7d8UL, 0xa23f3fafUL, + 0x96d89736UL, 0x857a0f41UL, 0x620305bcUL, 0x71a19dcbUL, 0x45463552UL, + 0x56e4ad25UL, 0x2c896460UL, 0x3f2bfc17UL, 0x0bcc548eUL, 0x186eccf9UL, + 0xc0d23785UL, 0xd370aff2UL, 0xe797076bUL, 0xf4359f1cUL, 0x8e585659UL, + 0x9dface2eUL, 0xa91d66b7UL, 0xbabffec0UL, 0x5dc6f43dUL, 0x4e646c4aUL, + 0x7a83c4d3UL, 0x69215ca4UL, 0x134c95e1UL, 
0x00ee0d96UL, 0x3409a50fUL, + 0x27ab3d78UL, 0x809c2506UL, 0x933ebd71UL, 0xa7d915e8UL, 0xb47b8d9fUL, + 0xce1644daUL, 0xddb4dcadUL, 0xe9537434UL, 0xfaf1ec43UL, 0x1d88e6beUL, + 0x0e2a7ec9UL, 0x3acdd650UL, 0x296f4e27UL, 0x53028762UL, 0x40a01f15UL, + 0x7447b78cUL, 0x67e52ffbUL, 0xbf59d487UL, 0xacfb4cf0UL, 0x981ce469UL, + 0x8bbe7c1eUL, 0xf1d3b55bUL, 0xe2712d2cUL, 0xd69685b5UL, 0xc5341dc2UL, + 0x224d173fUL, 0x31ef8f48UL, 0x050827d1UL, 0x16aabfa6UL, 0x6cc776e3UL, + 0x7f65ee94UL, 0x4b82460dUL, 0x5820de7aUL, 0xfbc3faf9UL, 0xe861628eUL, + 0xdc86ca17UL, 0xcf245260UL, 0xb5499b25UL, 0xa6eb0352UL, 0x920cabcbUL, + 0x81ae33bcUL, 0x66d73941UL, 0x7575a136UL, 0x419209afUL, 0x523091d8UL, + 0x285d589dUL, 0x3bffc0eaUL, 0x0f186873UL, 0x1cbaf004UL, 0xc4060b78UL, + 0xd7a4930fUL, 0xe3433b96UL, 0xf0e1a3e1UL, 0x8a8c6aa4UL, 0x992ef2d3UL, + 0xadc95a4aUL, 0xbe6bc23dUL, 0x5912c8c0UL, 0x4ab050b7UL, 0x7e57f82eUL, + 0x6df56059UL, 0x1798a91cUL, 0x043a316bUL, 0x30dd99f2UL, 0x237f0185UL, + 0x844819fbUL, 0x97ea818cUL, 0xa30d2915UL, 0xb0afb162UL, 0xcac27827UL, + 0xd960e050UL, 0xed8748c9UL, 0xfe25d0beUL, 0x195cda43UL, 0x0afe4234UL, + 0x3e19eaadUL, 0x2dbb72daUL, 0x57d6bb9fUL, 0x447423e8UL, 0x70938b71UL, + 0x63311306UL, 0xbb8de87aUL, 0xa82f700dUL, 0x9cc8d894UL, 0x8f6a40e3UL, + 0xf50789a6UL, 0xe6a511d1UL, 0xd242b948UL, 0xc1e0213fUL, 0x26992bc2UL, + 0x353bb3b5UL, 0x01dc1b2cUL, 0x127e835bUL, 0x68134a1eUL, 0x7bb1d269UL, + 0x4f567af0UL, 0x5cf4e287UL, 0x04d43cfdUL, 0x1776a48aUL, 0x23910c13UL, + 0x30339464UL, 0x4a5e5d21UL, 0x59fcc556UL, 0x6d1b6dcfUL, 0x7eb9f5b8UL, + 0x99c0ff45UL, 0x8a626732UL, 0xbe85cfabUL, 0xad2757dcUL, 0xd74a9e99UL, + 0xc4e806eeUL, 0xf00fae77UL, 0xe3ad3600UL, 0x3b11cd7cUL, 0x28b3550bUL, + 0x1c54fd92UL, 0x0ff665e5UL, 0x759baca0UL, 0x663934d7UL, 0x52de9c4eUL, + 0x417c0439UL, 0xa6050ec4UL, 0xb5a796b3UL, 0x81403e2aUL, 0x92e2a65dUL, + 0xe88f6f18UL, 0xfb2df76fUL, 0xcfca5ff6UL, 0xdc68c781UL, 0x7b5fdfffUL, + 0x68fd4788UL, 0x5c1aef11UL, 0x4fb87766UL, 0x35d5be23UL, 0x26772654UL, + 0x12908ecdUL, 0x013216baUL, 0xe64b1c47UL, 0xf5e98430UL, 0xc10e2ca9UL, + 0xd2acb4deUL, 0xa8c17d9bUL, 0xbb63e5ecUL, 0x8f844d75UL, 0x9c26d502UL, + 0x449a2e7eUL, 0x5738b609UL, 0x63df1e90UL, 0x707d86e7UL, 0x0a104fa2UL, + 0x19b2d7d5UL, 0x2d557f4cUL, 0x3ef7e73bUL, 0xd98eedc6UL, 0xca2c75b1UL, + 0xfecbdd28UL, 0xed69455fUL, 0x97048c1aUL, 0x84a6146dUL, 0xb041bcf4UL, + 0xa3e32483UL, + + // Table 2 + 0x00000000UL, 0xa541927eUL, 0x4f6f520dUL, 0xea2ec073UL, 0x9edea41aUL, + 0x3b9f3664UL, 0xd1b1f617UL, 0x74f06469UL, 0x38513ec5UL, 0x9d10acbbUL, + 0x773e6cc8UL, 0xd27ffeb6UL, 0xa68f9adfUL, 0x03ce08a1UL, 0xe9e0c8d2UL, + 0x4ca15aacUL, 0x70a27d8aUL, 0xd5e3eff4UL, 0x3fcd2f87UL, 0x9a8cbdf9UL, + 0xee7cd990UL, 0x4b3d4beeUL, 0xa1138b9dUL, 0x045219e3UL, 0x48f3434fUL, + 0xedb2d131UL, 0x079c1142UL, 0xa2dd833cUL, 0xd62de755UL, 0x736c752bUL, + 0x9942b558UL, 0x3c032726UL, 0xe144fb14UL, 0x4405696aUL, 0xae2ba919UL, + 0x0b6a3b67UL, 0x7f9a5f0eUL, 0xdadbcd70UL, 0x30f50d03UL, 0x95b49f7dUL, + 0xd915c5d1UL, 0x7c5457afUL, 0x967a97dcUL, 0x333b05a2UL, 0x47cb61cbUL, + 0xe28af3b5UL, 0x08a433c6UL, 0xade5a1b8UL, 0x91e6869eUL, 0x34a714e0UL, + 0xde89d493UL, 0x7bc846edUL, 0x0f382284UL, 0xaa79b0faUL, 0x40577089UL, + 0xe516e2f7UL, 0xa9b7b85bUL, 0x0cf62a25UL, 0xe6d8ea56UL, 0x43997828UL, + 0x37691c41UL, 0x92288e3fUL, 0x78064e4cUL, 0xdd47dc32UL, 0xc76580d9UL, + 0x622412a7UL, 0x880ad2d4UL, 0x2d4b40aaUL, 0x59bb24c3UL, 0xfcfab6bdUL, + 0x16d476ceUL, 0xb395e4b0UL, 0xff34be1cUL, 0x5a752c62UL, 0xb05bec11UL, + 0x151a7e6fUL, 0x61ea1a06UL, 0xc4ab8878UL, 0x2e85480bUL, 0x8bc4da75UL, + 0xb7c7fd53UL, 0x12866f2dUL, 
0xf8a8af5eUL, 0x5de93d20UL, 0x29195949UL, + 0x8c58cb37UL, 0x66760b44UL, 0xc337993aUL, 0x8f96c396UL, 0x2ad751e8UL, + 0xc0f9919bUL, 0x65b803e5UL, 0x1148678cUL, 0xb409f5f2UL, 0x5e273581UL, + 0xfb66a7ffUL, 0x26217bcdUL, 0x8360e9b3UL, 0x694e29c0UL, 0xcc0fbbbeUL, + 0xb8ffdfd7UL, 0x1dbe4da9UL, 0xf7908ddaUL, 0x52d11fa4UL, 0x1e704508UL, + 0xbb31d776UL, 0x511f1705UL, 0xf45e857bUL, 0x80aee112UL, 0x25ef736cUL, + 0xcfc1b31fUL, 0x6a802161UL, 0x56830647UL, 0xf3c29439UL, 0x19ec544aUL, + 0xbcadc634UL, 0xc85da25dUL, 0x6d1c3023UL, 0x8732f050UL, 0x2273622eUL, + 0x6ed23882UL, 0xcb93aafcUL, 0x21bd6a8fUL, 0x84fcf8f1UL, 0xf00c9c98UL, + 0x554d0ee6UL, 0xbf63ce95UL, 0x1a225cebUL, 0x8b277743UL, 0x2e66e53dUL, + 0xc448254eUL, 0x6109b730UL, 0x15f9d359UL, 0xb0b84127UL, 0x5a968154UL, + 0xffd7132aUL, 0xb3764986UL, 0x1637dbf8UL, 0xfc191b8bUL, 0x595889f5UL, + 0x2da8ed9cUL, 0x88e97fe2UL, 0x62c7bf91UL, 0xc7862defUL, 0xfb850ac9UL, + 0x5ec498b7UL, 0xb4ea58c4UL, 0x11abcabaUL, 0x655baed3UL, 0xc01a3cadUL, + 0x2a34fcdeUL, 0x8f756ea0UL, 0xc3d4340cUL, 0x6695a672UL, 0x8cbb6601UL, + 0x29faf47fUL, 0x5d0a9016UL, 0xf84b0268UL, 0x1265c21bUL, 0xb7245065UL, + 0x6a638c57UL, 0xcf221e29UL, 0x250cde5aUL, 0x804d4c24UL, 0xf4bd284dUL, + 0x51fcba33UL, 0xbbd27a40UL, 0x1e93e83eUL, 0x5232b292UL, 0xf77320ecUL, + 0x1d5de09fUL, 0xb81c72e1UL, 0xccec1688UL, 0x69ad84f6UL, 0x83834485UL, + 0x26c2d6fbUL, 0x1ac1f1ddUL, 0xbf8063a3UL, 0x55aea3d0UL, 0xf0ef31aeUL, + 0x841f55c7UL, 0x215ec7b9UL, 0xcb7007caUL, 0x6e3195b4UL, 0x2290cf18UL, + 0x87d15d66UL, 0x6dff9d15UL, 0xc8be0f6bUL, 0xbc4e6b02UL, 0x190ff97cUL, + 0xf321390fUL, 0x5660ab71UL, 0x4c42f79aUL, 0xe90365e4UL, 0x032da597UL, + 0xa66c37e9UL, 0xd29c5380UL, 0x77ddc1feUL, 0x9df3018dUL, 0x38b293f3UL, + 0x7413c95fUL, 0xd1525b21UL, 0x3b7c9b52UL, 0x9e3d092cUL, 0xeacd6d45UL, + 0x4f8cff3bUL, 0xa5a23f48UL, 0x00e3ad36UL, 0x3ce08a10UL, 0x99a1186eUL, + 0x738fd81dUL, 0xd6ce4a63UL, 0xa23e2e0aUL, 0x077fbc74UL, 0xed517c07UL, + 0x4810ee79UL, 0x04b1b4d5UL, 0xa1f026abUL, 0x4bdee6d8UL, 0xee9f74a6UL, + 0x9a6f10cfUL, 0x3f2e82b1UL, 0xd50042c2UL, 0x7041d0bcUL, 0xad060c8eUL, + 0x08479ef0UL, 0xe2695e83UL, 0x4728ccfdUL, 0x33d8a894UL, 0x96993aeaUL, + 0x7cb7fa99UL, 0xd9f668e7UL, 0x9557324bUL, 0x3016a035UL, 0xda386046UL, + 0x7f79f238UL, 0x0b899651UL, 0xaec8042fUL, 0x44e6c45cUL, 0xe1a75622UL, + 0xdda47104UL, 0x78e5e37aUL, 0x92cb2309UL, 0x378ab177UL, 0x437ad51eUL, + 0xe63b4760UL, 0x0c158713UL, 0xa954156dUL, 0xe5f54fc1UL, 0x40b4ddbfUL, + 0xaa9a1dccUL, 0x0fdb8fb2UL, 0x7b2bebdbUL, 0xde6a79a5UL, 0x3444b9d6UL, + 0x91052ba8UL, + + // Table 3 + 0x00000000UL, 0xdd45aab8UL, 0xbf672381UL, 0x62228939UL, 0x7b2231f3UL, + 0xa6679b4bUL, 0xc4451272UL, 0x1900b8caUL, 0xf64463e6UL, 0x2b01c95eUL, + 0x49234067UL, 0x9466eadfUL, 0x8d665215UL, 0x5023f8adUL, 0x32017194UL, + 0xef44db2cUL, 0xe964b13dUL, 0x34211b85UL, 0x560392bcUL, 0x8b463804UL, + 0x924680ceUL, 0x4f032a76UL, 0x2d21a34fUL, 0xf06409f7UL, 0x1f20d2dbUL, + 0xc2657863UL, 0xa047f15aUL, 0x7d025be2UL, 0x6402e328UL, 0xb9474990UL, + 0xdb65c0a9UL, 0x06206a11UL, 0xd725148bUL, 0x0a60be33UL, 0x6842370aUL, + 0xb5079db2UL, 0xac072578UL, 0x71428fc0UL, 0x136006f9UL, 0xce25ac41UL, + 0x2161776dUL, 0xfc24ddd5UL, 0x9e0654ecUL, 0x4343fe54UL, 0x5a43469eUL, + 0x8706ec26UL, 0xe524651fUL, 0x3861cfa7UL, 0x3e41a5b6UL, 0xe3040f0eUL, + 0x81268637UL, 0x5c632c8fUL, 0x45639445UL, 0x98263efdUL, 0xfa04b7c4UL, + 0x27411d7cUL, 0xc805c650UL, 0x15406ce8UL, 0x7762e5d1UL, 0xaa274f69UL, + 0xb327f7a3UL, 0x6e625d1bUL, 0x0c40d422UL, 0xd1057e9aUL, 0xaba65fe7UL, + 0x76e3f55fUL, 0x14c17c66UL, 0xc984d6deUL, 0xd0846e14UL, 0x0dc1c4acUL, + 0x6fe34d95UL, 
0xb2a6e72dUL, 0x5de23c01UL, 0x80a796b9UL, 0xe2851f80UL, + 0x3fc0b538UL, 0x26c00df2UL, 0xfb85a74aUL, 0x99a72e73UL, 0x44e284cbUL, + 0x42c2eedaUL, 0x9f874462UL, 0xfda5cd5bUL, 0x20e067e3UL, 0x39e0df29UL, + 0xe4a57591UL, 0x8687fca8UL, 0x5bc25610UL, 0xb4868d3cUL, 0x69c32784UL, + 0x0be1aebdUL, 0xd6a40405UL, 0xcfa4bccfUL, 0x12e11677UL, 0x70c39f4eUL, + 0xad8635f6UL, 0x7c834b6cUL, 0xa1c6e1d4UL, 0xc3e468edUL, 0x1ea1c255UL, + 0x07a17a9fUL, 0xdae4d027UL, 0xb8c6591eUL, 0x6583f3a6UL, 0x8ac7288aUL, + 0x57828232UL, 0x35a00b0bUL, 0xe8e5a1b3UL, 0xf1e51979UL, 0x2ca0b3c1UL, + 0x4e823af8UL, 0x93c79040UL, 0x95e7fa51UL, 0x48a250e9UL, 0x2a80d9d0UL, + 0xf7c57368UL, 0xeec5cba2UL, 0x3380611aUL, 0x51a2e823UL, 0x8ce7429bUL, + 0x63a399b7UL, 0xbee6330fUL, 0xdcc4ba36UL, 0x0181108eUL, 0x1881a844UL, + 0xc5c402fcUL, 0xa7e68bc5UL, 0x7aa3217dUL, 0x52a0c93fUL, 0x8fe56387UL, + 0xedc7eabeUL, 0x30824006UL, 0x2982f8ccUL, 0xf4c75274UL, 0x96e5db4dUL, + 0x4ba071f5UL, 0xa4e4aad9UL, 0x79a10061UL, 0x1b838958UL, 0xc6c623e0UL, + 0xdfc69b2aUL, 0x02833192UL, 0x60a1b8abUL, 0xbde41213UL, 0xbbc47802UL, + 0x6681d2baUL, 0x04a35b83UL, 0xd9e6f13bUL, 0xc0e649f1UL, 0x1da3e349UL, + 0x7f816a70UL, 0xa2c4c0c8UL, 0x4d801be4UL, 0x90c5b15cUL, 0xf2e73865UL, + 0x2fa292ddUL, 0x36a22a17UL, 0xebe780afUL, 0x89c50996UL, 0x5480a32eUL, + 0x8585ddb4UL, 0x58c0770cUL, 0x3ae2fe35UL, 0xe7a7548dUL, 0xfea7ec47UL, + 0x23e246ffUL, 0x41c0cfc6UL, 0x9c85657eUL, 0x73c1be52UL, 0xae8414eaUL, + 0xcca69dd3UL, 0x11e3376bUL, 0x08e38fa1UL, 0xd5a62519UL, 0xb784ac20UL, + 0x6ac10698UL, 0x6ce16c89UL, 0xb1a4c631UL, 0xd3864f08UL, 0x0ec3e5b0UL, + 0x17c35d7aUL, 0xca86f7c2UL, 0xa8a47efbUL, 0x75e1d443UL, 0x9aa50f6fUL, + 0x47e0a5d7UL, 0x25c22ceeUL, 0xf8878656UL, 0xe1873e9cUL, 0x3cc29424UL, + 0x5ee01d1dUL, 0x83a5b7a5UL, 0xf90696d8UL, 0x24433c60UL, 0x4661b559UL, + 0x9b241fe1UL, 0x8224a72bUL, 0x5f610d93UL, 0x3d4384aaUL, 0xe0062e12UL, + 0x0f42f53eUL, 0xd2075f86UL, 0xb025d6bfUL, 0x6d607c07UL, 0x7460c4cdUL, + 0xa9256e75UL, 0xcb07e74cUL, 0x16424df4UL, 0x106227e5UL, 0xcd278d5dUL, + 0xaf050464UL, 0x7240aedcUL, 0x6b401616UL, 0xb605bcaeUL, 0xd4273597UL, + 0x09629f2fUL, 0xe6264403UL, 0x3b63eebbUL, 0x59416782UL, 0x8404cd3aUL, + 0x9d0475f0UL, 0x4041df48UL, 0x22635671UL, 0xff26fcc9UL, 0x2e238253UL, + 0xf36628ebUL, 0x9144a1d2UL, 0x4c010b6aUL, 0x5501b3a0UL, 0x88441918UL, + 0xea669021UL, 0x37233a99UL, 0xd867e1b5UL, 0x05224b0dUL, 0x6700c234UL, + 0xba45688cUL, 0xa345d046UL, 0x7e007afeUL, 0x1c22f3c7UL, 0xc167597fUL, + 0xc747336eUL, 0x1a0299d6UL, 0x782010efUL, 0xa565ba57UL, 0xbc65029dUL, + 0x6120a825UL, 0x0302211cUL, 0xde478ba4UL, 0x31035088UL, 0xec46fa30UL, + 0x8e647309UL, 0x5321d9b1UL, 0x4a21617bUL, 0x9764cbc3UL, 0xf54642faUL, + 0x2803e842UL, + // Constants for Neon CRC32C implementation, 128-bit operation + // k3 = 0x790606ff = x^160 mod poly - bit reversed + // k4 = 0x18b8ea18 = x^128 mod poly - bit reversed + // poly = x^32 + x^28 + x^27 + x^26 + x^25 + x^23 + x^22 + x^20 + x^19 + x^18 + x^14 + x^13 + x^11 + x^10 + x^9 + x^8 + x^6 + x^0 + 0 + 0x790606ff, 0x18b8ea18, // k4:k3 + 0x0679ff06, 0xb81818ea, // byte swap + 0x06ff7906, 0xea1818b8, // word swap + 0xff060679, 0x18eab818, // byte swap of word swap +}; + + +/** + * AES_TE[] table for AES encryption + */ +juint StubRoutines::aarch32::_aes_te_table[] + __attribute__ ((aligned(2048))) = +{ + //T + 0xc66363a5UL, 0xf87c7c84UL, 0xee777799UL, 0xf67b7b8dUL, + 0xfff2f20dUL, 0xd66b6bbdUL, 0xde6f6fb1UL, 0x91c5c554UL, + 0x60303050UL, 0x02010103UL, 0xce6767a9UL, 0x562b2b7dUL, + 0xe7fefe19UL, 0xb5d7d762UL, 0x4dababe6UL, 0xec76769aUL, + 0x8fcaca45UL, 0x1f82829dUL, 
0x89c9c940UL, 0xfa7d7d87UL, + 0xeffafa15UL, 0xb25959ebUL, 0x8e4747c9UL, 0xfbf0f00bUL, + 0x41adadecUL, 0xb3d4d467UL, 0x5fa2a2fdUL, 0x45afafeaUL, + 0x239c9cbfUL, 0x53a4a4f7UL, 0xe4727296UL, 0x9bc0c05bUL, + 0x75b7b7c2UL, 0xe1fdfd1cUL, 0x3d9393aeUL, 0x4c26266aUL, + 0x6c36365aUL, 0x7e3f3f41UL, 0xf5f7f702UL, 0x83cccc4fUL, + 0x6834345cUL, 0x51a5a5f4UL, 0xd1e5e534UL, 0xf9f1f108UL, + 0xe2717193UL, 0xabd8d873UL, 0x62313153UL, 0x2a15153fUL, + 0x0804040cUL, 0x95c7c752UL, 0x46232365UL, 0x9dc3c35eUL, + 0x30181828UL, 0x379696a1UL, 0x0a05050fUL, 0x2f9a9ab5UL, + 0x0e070709UL, 0x24121236UL, 0x1b80809bUL, 0xdfe2e23dUL, + 0xcdebeb26UL, 0x4e272769UL, 0x7fb2b2cdUL, 0xea75759fUL, + 0x1209091bUL, 0x1d83839eUL, 0x582c2c74UL, 0x341a1a2eUL, + 0x361b1b2dUL, 0xdc6e6eb2UL, 0xb45a5aeeUL, 0x5ba0a0fbUL, + 0xa45252f6UL, 0x763b3b4dUL, 0xb7d6d661UL, 0x7db3b3ceUL, + 0x5229297bUL, 0xdde3e33eUL, 0x5e2f2f71UL, 0x13848497UL, + 0xa65353f5UL, 0xb9d1d168UL, 0x00000000UL, 0xc1eded2cUL, + 0x40202060UL, 0xe3fcfc1fUL, 0x79b1b1c8UL, 0xb65b5bedUL, + 0xd46a6abeUL, 0x8dcbcb46UL, 0x67bebed9UL, 0x7239394bUL, + 0x944a4adeUL, 0x984c4cd4UL, 0xb05858e8UL, 0x85cfcf4aUL, + 0xbbd0d06bUL, 0xc5efef2aUL, 0x4faaaae5UL, 0xedfbfb16UL, + 0x864343c5UL, 0x9a4d4dd7UL, 0x66333355UL, 0x11858594UL, + 0x8a4545cfUL, 0xe9f9f910UL, 0x04020206UL, 0xfe7f7f81UL, + 0xa05050f0UL, 0x783c3c44UL, 0x259f9fbaUL, 0x4ba8a8e3UL, + 0xa25151f3UL, 0x5da3a3feUL, 0x804040c0UL, 0x058f8f8aUL, + 0x3f9292adUL, 0x219d9dbcUL, 0x70383848UL, 0xf1f5f504UL, + 0x63bcbcdfUL, 0x77b6b6c1UL, 0xafdada75UL, 0x42212163UL, + 0x20101030UL, 0xe5ffff1aUL, 0xfdf3f30eUL, 0xbfd2d26dUL, + 0x81cdcd4cUL, 0x180c0c14UL, 0x26131335UL, 0xc3ecec2fUL, + 0xbe5f5fe1UL, 0x359797a2UL, 0x884444ccUL, 0x2e171739UL, + 0x93c4c457UL, 0x55a7a7f2UL, 0xfc7e7e82UL, 0x7a3d3d47UL, + 0xc86464acUL, 0xba5d5de7UL, 0x3219192bUL, 0xe6737395UL, + 0xc06060a0UL, 0x19818198UL, 0x9e4f4fd1UL, 0xa3dcdc7fUL, + 0x44222266UL, 0x542a2a7eUL, 0x3b9090abUL, 0x0b888883UL, + 0x8c4646caUL, 0xc7eeee29UL, 0x6bb8b8d3UL, 0x2814143cUL, + 0xa7dede79UL, 0xbc5e5ee2UL, 0x160b0b1dUL, 0xaddbdb76UL, + 0xdbe0e03bUL, 0x64323256UL, 0x743a3a4eUL, 0x140a0a1eUL, + 0x924949dbUL, 0x0c06060aUL, 0x4824246cUL, 0xb85c5ce4UL, + 0x9fc2c25dUL, 0xbdd3d36eUL, 0x43acacefUL, 0xc46262a6UL, + 0x399191a8UL, 0x319595a4UL, 0xd3e4e437UL, 0xf279798bUL, + 0xd5e7e732UL, 0x8bc8c843UL, 0x6e373759UL, 0xda6d6db7UL, + 0x018d8d8cUL, 0xb1d5d564UL, 0x9c4e4ed2UL, 0x49a9a9e0UL, + 0xd86c6cb4UL, 0xac5656faUL, 0xf3f4f407UL, 0xcfeaea25UL, + 0xca6565afUL, 0xf47a7a8eUL, 0x47aeaee9UL, 0x10080818UL, + 0x6fbabad5UL, 0xf0787888UL, 0x4a25256fUL, 0x5c2e2e72UL, + 0x381c1c24UL, 0x57a6a6f1UL, 0x73b4b4c7UL, 0x97c6c651UL, + 0xcbe8e823UL, 0xa1dddd7cUL, 0xe874749cUL, 0x3e1f1f21UL, + 0x964b4bddUL, 0x61bdbddcUL, 0x0d8b8b86UL, 0x0f8a8a85UL, + 0xe0707090UL, 0x7c3e3e42UL, 0x71b5b5c4UL, 0xcc6666aaUL, + 0x904848d8UL, 0x06030305UL, 0xf7f6f601UL, 0x1c0e0e12UL, + 0xc26161a3UL, 0x6a35355fUL, 0xae5757f9UL, 0x69b9b9d0UL, + 0x17868691UL, 0x99c1c158UL, 0x3a1d1d27UL, 0x279e9eb9UL, + 0xd9e1e138UL, 0xebf8f813UL, 0x2b9898b3UL, 0x22111133UL, + 0xd26969bbUL, 0xa9d9d970UL, 0x078e8e89UL, 0x339494a7UL, + 0x2d9b9bb6UL, 0x3c1e1e22UL, 0x15878792UL, 0xc9e9e920UL, + 0x87cece49UL, 0xaa5555ffUL, 0x50282878UL, 0xa5dfdf7aUL, + 0x038c8c8fUL, 0x59a1a1f8UL, 0x09898980UL, 0x1a0d0d17UL, + 0x65bfbfdaUL, 0xd7e6e631UL, 0x844242c6UL, 0xd06868b8UL, + 0x824141c3UL, 0x299999b0UL, 0x5a2d2d77UL, 0x1e0f0f11UL, + 0x7bb0b0cbUL, 0xa85454fcUL, 0x6dbbbbd6UL, 0x2c16163aUL, + //S + 0x63UL, 0x7cUL, 0x77UL, 0x7bUL, 0xf2UL, 0x6bUL, 0x6fUL, 0xc5UL, + 0x30UL, 0x01UL, 0x67UL, 
0x2bUL, 0xfeUL, 0xd7UL, 0xabUL, 0x76UL, + 0xcaUL, 0x82UL, 0xc9UL, 0x7dUL, 0xfaUL, 0x59UL, 0x47UL, 0xf0UL, + 0xadUL, 0xd4UL, 0xa2UL, 0xafUL, 0x9cUL, 0xa4UL, 0x72UL, 0xc0UL, + 0xb7UL, 0xfdUL, 0x93UL, 0x26UL, 0x36UL, 0x3fUL, 0xf7UL, 0xccUL, + 0x34UL, 0xa5UL, 0xe5UL, 0xf1UL, 0x71UL, 0xd8UL, 0x31UL, 0x15UL, + 0x04UL, 0xc7UL, 0x23UL, 0xc3UL, 0x18UL, 0x96UL, 0x05UL, 0x9aUL, + 0x07UL, 0x12UL, 0x80UL, 0xe2UL, 0xebUL, 0x27UL, 0xb2UL, 0x75UL, + 0x09UL, 0x83UL, 0x2cUL, 0x1aUL, 0x1bUL, 0x6eUL, 0x5aUL, 0xa0UL, + 0x52UL, 0x3bUL, 0xd6UL, 0xb3UL, 0x29UL, 0xe3UL, 0x2fUL, 0x84UL, + 0x53UL, 0xd1UL, 0x00UL, 0xedUL, 0x20UL, 0xfcUL, 0xb1UL, 0x5bUL, + 0x6aUL, 0xcbUL, 0xbeUL, 0x39UL, 0x4aUL, 0x4cUL, 0x58UL, 0xcfUL, + 0xd0UL, 0xefUL, 0xaaUL, 0xfbUL, 0x43UL, 0x4dUL, 0x33UL, 0x85UL, + 0x45UL, 0xf9UL, 0x02UL, 0x7fUL, 0x50UL, 0x3cUL, 0x9fUL, 0xa8UL, + 0x51UL, 0xa3UL, 0x40UL, 0x8fUL, 0x92UL, 0x9dUL, 0x38UL, 0xf5UL, + 0xbcUL, 0xb6UL, 0xdaUL, 0x21UL, 0x10UL, 0xffUL, 0xf3UL, 0xd2UL, + 0xcdUL, 0x0cUL, 0x13UL, 0xecUL, 0x5fUL, 0x97UL, 0x44UL, 0x17UL, + 0xc4UL, 0xa7UL, 0x7eUL, 0x3dUL, 0x64UL, 0x5dUL, 0x19UL, 0x73UL, + 0x60UL, 0x81UL, 0x4fUL, 0xdcUL, 0x22UL, 0x2aUL, 0x90UL, 0x88UL, + 0x46UL, 0xeeUL, 0xb8UL, 0x14UL, 0xdeUL, 0x5eUL, 0x0bUL, 0xdbUL, + 0xe0UL, 0x32UL, 0x3aUL, 0x0aUL, 0x49UL, 0x06UL, 0x24UL, 0x5cUL, + 0xc2UL, 0xd3UL, 0xacUL, 0x62UL, 0x91UL, 0x95UL, 0xe4UL, 0x79UL, + 0xe7UL, 0xc8UL, 0x37UL, 0x6dUL, 0x8dUL, 0xd5UL, 0x4eUL, 0xa9UL, + 0x6cUL, 0x56UL, 0xf4UL, 0xeaUL, 0x65UL, 0x7aUL, 0xaeUL, 0x08UL, + 0xbaUL, 0x78UL, 0x25UL, 0x2eUL, 0x1cUL, 0xa6UL, 0xb4UL, 0xc6UL, + 0xe8UL, 0xddUL, 0x74UL, 0x1fUL, 0x4bUL, 0xbdUL, 0x8bUL, 0x8aUL, + 0x70UL, 0x3eUL, 0xb5UL, 0x66UL, 0x48UL, 0x03UL, 0xf6UL, 0x0eUL, + 0x61UL, 0x35UL, 0x57UL, 0xb9UL, 0x86UL, 0xc1UL, 0x1dUL, 0x9eUL, + 0xe1UL, 0xf8UL, 0x98UL, 0x11UL, 0x69UL, 0xd9UL, 0x8eUL, 0x94UL, + 0x9bUL, 0x1eUL, 0x87UL, 0xe9UL, 0xceUL, 0x55UL, 0x28UL, 0xdfUL, + 0x8cUL, 0xa1UL, 0x89UL, 0x0dUL, 0xbfUL, 0xe6UL, 0x42UL, 0x68UL, + 0x41UL, 0x99UL, 0x2dUL, 0x0fUL, 0xb0UL, 0x54UL, 0xbbUL, 0x16UL, + //rcon + 0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL, + 0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL, + 0x1B000000UL, 0x36000000UL, 0UL, 0UL, + 0UL, 0UL, 0UL, 0UL +}; + + +/** + * AES_TD[] table for AES decryption + */ +juint StubRoutines::aarch32::_aes_td_table[] + __attribute__ ((aligned(2048))) = +{ + //T + 0x51f4a750UL, 0x7e416553UL, 0x1a17a4c3UL, 0x3a275e96UL, + 0x3bab6bcbUL, 0x1f9d45f1UL, 0xacfa58abUL, 0x4be30393UL, + 0x2030fa55UL, 0xad766df6UL, 0x88cc7691UL, 0xf5024c25UL, + 0x4fe5d7fcUL, 0xc52acbd7UL, 0x26354480UL, 0xb562a38fUL, + 0xdeb15a49UL, 0x25ba1b67UL, 0x45ea0e98UL, 0x5dfec0e1UL, + 0xc32f7502UL, 0x814cf012UL, 0x8d4697a3UL, 0x6bd3f9c6UL, + 0x038f5fe7UL, 0x15929c95UL, 0xbf6d7aebUL, 0x955259daUL, + 0xd4be832dUL, 0x587421d3UL, 0x49e06929UL, 0x8ec9c844UL, + 0x75c2896aUL, 0xf48e7978UL, 0x99583e6bUL, 0x27b971ddUL, + 0xbee14fb6UL, 0xf088ad17UL, 0xc920ac66UL, 0x7dce3ab4UL, + 0x63df4a18UL, 0xe51a3182UL, 0x97513360UL, 0x62537f45UL, + 0xb16477e0UL, 0xbb6bae84UL, 0xfe81a01cUL, 0xf9082b94UL, + 0x70486858UL, 0x8f45fd19UL, 0x94de6c87UL, 0x527bf8b7UL, + 0xab73d323UL, 0x724b02e2UL, 0xe31f8f57UL, 0x6655ab2aUL, + 0xb2eb2807UL, 0x2fb5c203UL, 0x86c57b9aUL, 0xd33708a5UL, + 0x302887f2UL, 0x23bfa5b2UL, 0x02036abaUL, 0xed16825cUL, + 0x8acf1c2bUL, 0xa779b492UL, 0xf307f2f0UL, 0x4e69e2a1UL, + 0x65daf4cdUL, 0x0605bed5UL, 0xd134621fUL, 0xc4a6fe8aUL, + 0x342e539dUL, 0xa2f355a0UL, 0x058ae132UL, 0xa4f6eb75UL, + 0x0b83ec39UL, 0x4060efaaUL, 0x5e719f06UL, 0xbd6e1051UL, + 0x3e218af9UL, 0x96dd063dUL, 
0xdd3e05aeUL, 0x4de6bd46UL, + 0x91548db5UL, 0x71c45d05UL, 0x0406d46fUL, 0x605015ffUL, + 0x1998fb24UL, 0xd6bde997UL, 0x894043ccUL, 0x67d99e77UL, + 0xb0e842bdUL, 0x07898b88UL, 0xe7195b38UL, 0x79c8eedbUL, + 0xa17c0a47UL, 0x7c420fe9UL, 0xf8841ec9UL, 0x00000000UL, + 0x09808683UL, 0x322bed48UL, 0x1e1170acUL, 0x6c5a724eUL, + 0xfd0efffbUL, 0x0f853856UL, 0x3daed51eUL, 0x362d3927UL, + 0x0a0fd964UL, 0x685ca621UL, 0x9b5b54d1UL, 0x24362e3aUL, + 0x0c0a67b1UL, 0x9357e70fUL, 0xb4ee96d2UL, 0x1b9b919eUL, + 0x80c0c54fUL, 0x61dc20a2UL, 0x5a774b69UL, 0x1c121a16UL, + 0xe293ba0aUL, 0xc0a02ae5UL, 0x3c22e043UL, 0x121b171dUL, + 0x0e090d0bUL, 0xf28bc7adUL, 0x2db6a8b9UL, 0x141ea9c8UL, + 0x57f11985UL, 0xaf75074cUL, 0xee99ddbbUL, 0xa37f60fdUL, + 0xf701269fUL, 0x5c72f5bcUL, 0x44663bc5UL, 0x5bfb7e34UL, + 0x8b432976UL, 0xcb23c6dcUL, 0xb6edfc68UL, 0xb8e4f163UL, + 0xd731dccaUL, 0x42638510UL, 0x13972240UL, 0x84c61120UL, + 0x854a247dUL, 0xd2bb3df8UL, 0xaef93211UL, 0xc729a16dUL, + 0x1d9e2f4bUL, 0xdcb230f3UL, 0x0d8652ecUL, 0x77c1e3d0UL, + 0x2bb3166cUL, 0xa970b999UL, 0x119448faUL, 0x47e96422UL, + 0xa8fc8cc4UL, 0xa0f03f1aUL, 0x567d2cd8UL, 0x223390efUL, + 0x87494ec7UL, 0xd938d1c1UL, 0x8ccaa2feUL, 0x98d40b36UL, + 0xa6f581cfUL, 0xa57ade28UL, 0xdab78e26UL, 0x3fadbfa4UL, + 0x2c3a9de4UL, 0x5078920dUL, 0x6a5fcc9bUL, 0x547e4662UL, + 0xf68d13c2UL, 0x90d8b8e8UL, 0x2e39f75eUL, 0x82c3aff5UL, + 0x9f5d80beUL, 0x69d0937cUL, 0x6fd52da9UL, 0xcf2512b3UL, + 0xc8ac993bUL, 0x10187da7UL, 0xe89c636eUL, 0xdb3bbb7bUL, + 0xcd267809UL, 0x6e5918f4UL, 0xec9ab701UL, 0x834f9aa8UL, + 0xe6956e65UL, 0xaaffe67eUL, 0x21bccf08UL, 0xef15e8e6UL, + 0xbae79bd9UL, 0x4a6f36ceUL, 0xea9f09d4UL, 0x29b07cd6UL, + 0x31a4b2afUL, 0x2a3f2331UL, 0xc6a59430UL, 0x35a266c0UL, + 0x744ebc37UL, 0xfc82caa6UL, 0xe090d0b0UL, 0x33a7d815UL, + 0xf104984aUL, 0x41ecdaf7UL, 0x7fcd500eUL, 0x1791f62fUL, + 0x764dd68dUL, 0x43efb04dUL, 0xccaa4d54UL, 0xe49604dfUL, + 0x9ed1b5e3UL, 0x4c6a881bUL, 0xc12c1fb8UL, 0x4665517fUL, + 0x9d5eea04UL, 0x018c355dUL, 0xfa877473UL, 0xfb0b412eUL, + 0xb3671d5aUL, 0x92dbd252UL, 0xe9105633UL, 0x6dd64713UL, + 0x9ad7618cUL, 0x37a10c7aUL, 0x59f8148eUL, 0xeb133c89UL, + 0xcea927eeUL, 0xb761c935UL, 0xe11ce5edUL, 0x7a47b13cUL, + 0x9cd2df59UL, 0x55f2733fUL, 0x1814ce79UL, 0x73c737bfUL, + 0x53f7cdeaUL, 0x5ffdaa5bUL, 0xdf3d6f14UL, 0x7844db86UL, + 0xcaaff381UL, 0xb968c43eUL, 0x3824342cUL, 0xc2a3405fUL, + 0x161dc372UL, 0xbce2250cUL, 0x283c498bUL, 0xff0d9541UL, + 0x39a80171UL, 0x080cb3deUL, 0xd8b4e49cUL, 0x6456c190UL, + 0x7bcb8461UL, 0xd532b670UL, 0x486c5c74UL, 0xd0b85742UL, +//S + 0x52UL, 0x09UL, 0x6aUL, 0xd5UL, 0x30UL, 0x36UL, 0xa5UL, 0x38UL, + 0xbfUL, 0x40UL, 0xa3UL, 0x9eUL, 0x81UL, 0xf3UL, 0xd7UL, 0xfbUL, + 0x7cUL, 0xe3UL, 0x39UL, 0x82UL, 0x9bUL, 0x2fUL, 0xffUL, 0x87UL, + 0x34UL, 0x8eUL, 0x43UL, 0x44UL, 0xc4UL, 0xdeUL, 0xe9UL, 0xcbUL, + 0x54UL, 0x7bUL, 0x94UL, 0x32UL, 0xa6UL, 0xc2UL, 0x23UL, 0x3dUL, + 0xeeUL, 0x4cUL, 0x95UL, 0x0bUL, 0x42UL, 0xfaUL, 0xc3UL, 0x4eUL, + 0x08UL, 0x2eUL, 0xa1UL, 0x66UL, 0x28UL, 0xd9UL, 0x24UL, 0xb2UL, + 0x76UL, 0x5bUL, 0xa2UL, 0x49UL, 0x6dUL, 0x8bUL, 0xd1UL, 0x25UL, + 0x72UL, 0xf8UL, 0xf6UL, 0x64UL, 0x86UL, 0x68UL, 0x98UL, 0x16UL, + 0xd4UL, 0xa4UL, 0x5cUL, 0xccUL, 0x5dUL, 0x65UL, 0xb6UL, 0x92UL, + 0x6cUL, 0x70UL, 0x48UL, 0x50UL, 0xfdUL, 0xedUL, 0xb9UL, 0xdaUL, + 0x5eUL, 0x15UL, 0x46UL, 0x57UL, 0xa7UL, 0x8dUL, 0x9dUL, 0x84UL, + 0x90UL, 0xd8UL, 0xabUL, 0x00UL, 0x8cUL, 0xbcUL, 0xd3UL, 0x0aUL, + 0xf7UL, 0xe4UL, 0x58UL, 0x05UL, 0xb8UL, 0xb3UL, 0x45UL, 0x06UL, + 0xd0UL, 0x2cUL, 0x1eUL, 0x8fUL, 0xcaUL, 0x3fUL, 0x0fUL, 0x02UL, + 0xc1UL, 0xafUL, 0xbdUL, 0x03UL, 
0x01UL, 0x13UL, 0x8aUL, 0x6bUL, + 0x3aUL, 0x91UL, 0x11UL, 0x41UL, 0x4fUL, 0x67UL, 0xdcUL, 0xeaUL, + 0x97UL, 0xf2UL, 0xcfUL, 0xceUL, 0xf0UL, 0xb4UL, 0xe6UL, 0x73UL, + 0x96UL, 0xacUL, 0x74UL, 0x22UL, 0xe7UL, 0xadUL, 0x35UL, 0x85UL, + 0xe2UL, 0xf9UL, 0x37UL, 0xe8UL, 0x1cUL, 0x75UL, 0xdfUL, 0x6eUL, + 0x47UL, 0xf1UL, 0x1aUL, 0x71UL, 0x1dUL, 0x29UL, 0xc5UL, 0x89UL, + 0x6fUL, 0xb7UL, 0x62UL, 0x0eUL, 0xaaUL, 0x18UL, 0xbeUL, 0x1bUL, + 0xfcUL, 0x56UL, 0x3eUL, 0x4bUL, 0xc6UL, 0xd2UL, 0x79UL, 0x20UL, + 0x9aUL, 0xdbUL, 0xc0UL, 0xfeUL, 0x78UL, 0xcdUL, 0x5aUL, 0xf4UL, + 0x1fUL, 0xddUL, 0xa8UL, 0x33UL, 0x88UL, 0x07UL, 0xc7UL, 0x31UL, + 0xb1UL, 0x12UL, 0x10UL, 0x59UL, 0x27UL, 0x80UL, 0xecUL, 0x5fUL, + 0x60UL, 0x51UL, 0x7fUL, 0xa9UL, 0x19UL, 0xb5UL, 0x4aUL, 0x0dUL, + 0x2dUL, 0xe5UL, 0x7aUL, 0x9fUL, 0x93UL, 0xc9UL, 0x9cUL, 0xefUL, + 0xa0UL, 0xe0UL, 0x3bUL, 0x4dUL, 0xaeUL, 0x2aUL, 0xf5UL, 0xb0UL, + 0xc8UL, 0xebUL, 0xbbUL, 0x3cUL, 0x83UL, 0x53UL, 0x99UL, 0x61UL, + 0x17UL, 0x2bUL, 0x04UL, 0x7eUL, 0xbaUL, 0x77UL, 0xd6UL, 0x26UL, + 0xe1UL, 0x69UL, 0x14UL, 0x63UL, 0x55UL, 0x21UL, 0x0cUL, 0x7dUL +}; + +/** + * SHA256[] table for SHA256 Digest + */ +juint StubRoutines::aarch32::_sha1_table[] + __attribute__ ((aligned(1024))) = +{ + //k + 0x5A827999UL, 0x6ED9EBA1UL, 0x8F1BBCDCUL, 0xCA62C1D6UL +}; + +/** + * SHA256[] table for SHA256 Digest + */ +juint StubRoutines::aarch32::_sha256_table[] + __attribute__ ((aligned(1024))) = +{ + //k + 0x428A2F98UL, 0x71374491UL, 0xB5C0FBCFUL, 0xE9B5DBA5UL, + 0x3956C25BUL, 0x59F111F1UL, 0x923F82A4UL, 0xAB1C5ED5UL, + 0xD807AA98UL, 0x12835B01UL, 0x243185BEUL, 0x550C7DC3UL, + 0x72BE5D74UL, 0x80DEB1FEUL, 0x9BDC06A7UL, 0xC19BF174UL, + 0xE49B69C1UL, 0xEFBE4786UL, 0x0FC19DC6UL, 0x240CA1CCUL, + 0x2DE92C6FUL, 0x4A7484AAUL, 0x5CB0A9DCUL, 0x76F988DAUL, + 0x983E5152UL, 0xA831C66DUL, 0xB00327C8UL, 0xBF597FC7UL, + 0xC6E00BF3UL, 0xD5A79147UL, 0x06CA6351UL, 0x14292967UL, + 0x27B70A85UL, 0x2E1B2138UL, 0x4D2C6DFCUL, 0x53380D13UL, + 0x650A7354UL, 0x766A0ABBUL, 0x81C2C92EUL, 0x92722C85UL, + 0xA2BFE8A1UL, 0xA81A664BUL, 0xC24B8B70UL, 0xC76C51A3UL, + 0xD192E819UL, 0xD6990624UL, 0xF40E3585UL, 0x106AA070UL, + 0x19A4C116UL, 0x1E376C08UL, 0x2748774CUL, 0x34B0BCB5UL, + 0x391C0CB3UL, 0x4ED8AA4AUL, 0x5B9CCA4FUL, 0x682E6FF3UL, + 0x748F82EEUL, 0x78A5636FUL, 0x84C87814UL, 0x8CC70208UL, + 0x90BEFFFAUL, 0xA4506CEBUL, 0xBEF9A3F7UL, 0xC67178F2UL +}; + +/** + * SHA512[] table for SHA512 Digest + */ +julong StubRoutines::aarch32::_sha512_table[] + __attribute__ ((aligned(1024))) = +{ + //k + 0x428a2f98d728ae22UL, 0x7137449123ef65cdUL, 0xb5c0fbcfec4d3b2fUL, 0xe9b5dba58189dbbcUL, + 0x3956c25bf348b538UL, 0x59f111f1b605d019UL, 0x923f82a4af194f9bUL, 0xab1c5ed5da6d8118UL, + 0xd807aa98a3030242UL, 0x12835b0145706fbeUL, 0x243185be4ee4b28cUL, 0x550c7dc3d5ffb4e2UL, + 0x72be5d74f27b896fUL, 0x80deb1fe3b1696b1UL, 0x9bdc06a725c71235UL, 0xc19bf174cf692694UL, + 0xe49b69c19ef14ad2UL, 0xefbe4786384f25e3UL, 0x0fc19dc68b8cd5b5UL, 0x240ca1cc77ac9c65UL, + 0x2de92c6f592b0275UL, 0x4a7484aa6ea6e483UL, 0x5cb0a9dcbd41fbd4UL, 0x76f988da831153b5UL, + 0x983e5152ee66dfabUL, 0xa831c66d2db43210UL, 0xb00327c898fb213fUL, 0xbf597fc7beef0ee4UL, + 0xc6e00bf33da88fc2UL, 0xd5a79147930aa725UL, 0x06ca6351e003826fUL, 0x142929670a0e6e70UL, + 0x27b70a8546d22ffcUL, 0x2e1b21385c26c926UL, 0x4d2c6dfc5ac42aedUL, 0x53380d139d95b3dfUL, + 0x650a73548baf63deUL, 0x766a0abb3c77b2a8UL, 0x81c2c92e47edaee6UL, 0x92722c851482353bUL, + 0xa2bfe8a14cf10364UL, 0xa81a664bbc423001UL, 0xc24b8b70d0f89791UL, 0xc76c51a30654be30UL, + 0xd192e819d6ef5218UL, 0xd69906245565a910UL, 
0xf40e35855771202aUL, 0x106aa07032bbd1b8UL, + 0x19a4c116b8d2d0c8UL, 0x1e376c085141ab53UL, 0x2748774cdf8eeb99UL, 0x34b0bcb5e19b48a8UL, + 0x391c0cb3c5c95a63UL, 0x4ed8aa4ae3418acbUL, 0x5b9cca4f7763e373UL, 0x682e6ff3d6b2b8a3UL, + 0x748f82ee5defb2fcUL, 0x78a5636f43172f60UL, 0x84c87814a1f0ab72UL, 0x8cc702081a6439ecUL, + 0x90befffa23631e28UL, 0xa4506cebde82bde9UL, 0xbef9a3f7b2c67915UL, 0xc67178f2e372532bUL, + 0xca273eceea26619cUL, 0xd186b8c721c0c207UL, 0xeada7dd6cde0eb1eUL, 0xf57d4f7fee6ed178UL, + 0x06f067aa72176fbaUL, 0x0a637dc5a2c898a6UL, 0x113f9804bef90daeUL, 0x1b710b35131c471bUL, + 0x28db77f523047d84UL, 0x32caab7b40c72493UL, 0x3c9ebe0a15c9bebcUL, 0x431d67c49c100d4cUL, + 0x4cc5d4becb3e42b6UL, 0x597f299cfc657e2aUL, 0x5fcb6fab3ad6faecUL, 0x6c44198c4a475817UL +}; + +address StubRoutines::_cipherBlockChaining_encryptAESCrypt_special = NULL; +address StubRoutines::_cipherBlockChaining_decryptAESCrypt_special = NULL; +address StubRoutines::_aes_table_te_addr = NULL; +address StubRoutines::_aes_table_td_addr = NULL; + +address StubRoutines::_sha1_table_addr = NULL; +address StubRoutines::_sha256_table_addr = NULL; +address StubRoutines::_sha512_table_addr = NULL; --- /dev/null 2018-09-25 19:25:27.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/stubRoutines_aarch32.hpp 2018-09-25 19:25:27.000000000 +0300 @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2003, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_STUBROUTINES_AARCH32_HPP +#define CPU_AARCH32_VM_STUBROUTINES_AARCH32_HPP + +// This file holds the platform specific parts of the StubRoutines +// definition. See stubRoutines.hpp for a description on how to +// extend it. + +// n.b. if we are notifying entry/exit to the simulator then the call +// stub does a notify at normal return placing +// call_stub_return_address one instruction beyond the notify. the +// latter address is sued by the stack unwind code when doign an +// exception return. 
+static bool returns_to_call_stub(address return_pc) { + return return_pc == _call_stub_return_address; +} + +enum platform_dependent_constants { + code_size1 = 19000, // simply increase if too small (assembler will crash if too small) + code_size2 = 22000 // simply increase if too small (assembler will crash if too small) +}; + +class aarch32 { + friend class StubGenerator; + + private: +#ifdef COMPILER2 + static address _idiv_entry; + static address _irem_entry; + static address _partial_subtype_check; + static address _string_compress_neon; + static address _string_inflate_neon; +#endif + + public: + +#ifdef COMPILER2 + static address idiv_entry() { + return _idiv_entry; + } + + static address irem_entry() { + return _irem_entry; + } + + static address partial_subtype_check() { + return _partial_subtype_check; + } + + static address string_compress_neon() { + return _string_compress_neon; + } + + static address string_inflate_neon() { + return _string_inflate_neon; + } +#endif + + private: + static juint _crc_table[]; + static juint _crc32c_table[]; + + private: + static juint _aes_te_table[]; + static juint _aes_td_table[]; + + private: + static juint _sha1_table[]; + static juint _sha256_table[]; + static julong _sha512_table[]; +}; + + + static address _cipherBlockChaining_encryptAESCrypt_special; + static address _cipherBlockChaining_decryptAESCrypt_special; + + static address _aes_table_te_addr; + static address _aes_table_td_addr; + + static address _sha1_table_addr; + static address _sha256_table_addr; + static address _sha512_table_addr; + +public: + static address cipherBlockChaining_encryptAESCrypt_special() { return _cipherBlockChaining_encryptAESCrypt_special; } + static address cipherBlockChaining_decryptAESCrypt_special() { return _cipherBlockChaining_decryptAESCrypt_special; } + static address aes_table_te_addr() { return _aes_table_te_addr; } + static address aes_table_td_addr() { return _aes_table_td_addr; } + + static address sha1_table_addr() { return _sha1_table_addr; } + static address sha256_table_addr() { return _sha256_table_addr; } + static address sha512_table_addr() { return _sha512_table_addr; } + +#endif // CPU_AARCH32_VM_STUBROUTINES_AARCH32_HPP --- /dev/null 2018-09-25 19:25:28.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/templateInterpreterGenerator_aarch32.cpp 2018-09-25 19:25:28.000000000 +0300 @@ -0,0 +1,2312 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "interp_masm_aarch32.hpp" +#include "interpreter/bytecodeHistogram.hpp" +#include "interpreter/bytecodeTracer.hpp" +#include "interpreter/interpreter.hpp" +#include "interpreter/templateInterpreterGenerator.hpp" +#include "interpreter/interpreterRuntime.hpp" +#include "interpreter/templateTable.hpp" +#include "oops/arrayOop.hpp" +#include "oops/method.hpp" +#include "oops/methodData.hpp" +#include "oops/oop.inline.hpp" +#include "prims/jvmtiExport.hpp" +#include "prims/jvmtiThreadState.hpp" +#include "runtime/arguments.hpp" +#include "runtime/deoptimization.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/synchronizer.hpp" +#include "runtime/timer.hpp" +#include "runtime/vframeArray.hpp" +#include "utilities/debug.hpp" + +#include + +#ifndef PRODUCT +#include "oops/method.hpp" +#include "vm_version_aarch32.hpp" +#endif // !PRODUCT + +// Size of interpreter code. Increase if too small. Interpreter will +// fail with a guarantee ("not enough space for interpreter generation"); +// if too small. +// Run with +PrintInterpreter to get the VM to print out the size. +// Max size with JVMTI +int TemplateInterpreter::InterpreterCodeSize = 200 * 1024; + +#define __ _masm-> + +//----------------------------------------------------------------------------- + +extern "C" void entry(CodeBuffer*); + +//----------------------------------------------------------------------------- + +address TemplateInterpreterGenerator::generate_slow_signature_handler() { + address entry = __ pc(); + + // The sp should be aligned on entry to the bottom of where the integer args + // need to be copied to. + + // rmethod + // rlocals + // c_rarg3: first stack arg - wordSize + + __ mov(c_rarg3, sp); + __ sub(sp, sp, 22 * wordSize); + __ str(lr, sp); + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::slow_signature_handler), + rmethod, rlocals, c_rarg3); + + // r0: result handler + + // Stack layout: + // rsp: return address <- sp (lowest addr) + // 1 float/double identifiers with the following structure: + // 16 bit - 2 bits per word free/in use indication (0==in use) + // 8 bits - 1 bit per word, double/float indication (0==double) + // 4 integer args (if static first is unused) + // 8 double args (defined by ARM calling convention spec) + // stack args <- sp (on entry) + // garbage + // expression stack bottom + // bcp (NULL) + // ... + // If this changes, update interpreterRt_aarch32.cpp slowpath! 
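+  // The float/double identifier word described above is decoded below under
+  // HARD_FLOAT_CC: bit (16 + i) selects double vs. float pair for FP slot i
+  // (0 == double), while bits (2*i) and (2*i + 1) mark the corresponding
+  // argument words as free (0 == in use).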
+ + // Restore LR + __ ldr(lr, sp); + +#ifdef HARD_FLOAT_CC + // Do FP first so we can use c_rarg3 as temp + __ ldr(c_rarg3, Address(sp, wordSize)); // float/double identifiers + + { + Label fp_done; + // each iteration covers either single double register or up to 2 float registers + for (int i = 0; i < Argument::n_float_register_parameters_c; i++) { + Label d, done; + + __ tst(c_rarg3, 1 << i+16); + __ b(d, __ EQ); + __ tst(c_rarg3, 1 << i*2); + __ b(fp_done, __ NE); + __ vldr_f32(as_FloatRegister(i*2), Address(sp, (6 + 2 * i) * wordSize)); + __ tst(c_rarg3, 1 << i*2+1); + __ vldr_f32(as_FloatRegister(i*2+1), Address(sp, (7 + 2 * i) * wordSize), __ EQ); + __ b(done); + __ bind(d); + __ vldr_f64(as_DoubleFloatRegister(i), Address(sp, (6 + 2 * i) * wordSize)); + __ bind(done); + } + __ bind(fp_done); + } +#endif // HARD_FLOAT_CC + + // c_rarg0 contains the result from the call of + // InterpreterRuntime::slow_signature_handler so we don't touch it + // here. It will be loaded with the JNIEnv* later. + __ ldr(c_rarg1, Address(sp, 2 * wordSize)); + __ ldrd(c_rarg2, c_rarg3, Address(sp, 3 * wordSize)); + + __ add(sp, sp, 22 * wordSize); + __ b(lr); + + return entry; +} + + +// +// Various method entries +// + +address TemplateInterpreterGenerator::generate_math_entry(AbstractInterpreter::MethodKind kind) { + // rmethod: Method* + // r4: sender sp + // sp: args + + //if (!InlineIntrinsics) return NULL; // Generate a vanilla entry + // FIXME currently ignoring this flag and inlining anyway + + // These don't need a safepoint check because they aren't virtually + // callable. We won't enter these intrinsics from compiled code. + // If in the future we added an intrinsic which was virtually callable + // we'd have to worry about how to safepoint so that this code is used. 
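+  // Overview of the cases below: abs and sqrt are computed inline with VFP
+  // instructions when an FPU is present; all other kinds (and the soft-float
+  // fallback for abs/sqrt) load the operand(s) into core registers or d0/d1
+  // and fall through to generate_transcendental_entry, which calls the
+  // matching SharedRuntime::d* routine.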
+ + // mathematical functions inlined by compiler + // (interpreter must provide identical implementation + // in order to avoid monotonicity bugs when switching + // from interpreter to compiler in the middle of some + // computation) + // + // stack: + // [ arg ] <-- sp + // [ arg ] + // retaddr in lr + + address entry_point = NULL; + Register continuation = lr; + bool transcendental_entry = false; + + switch (kind) { + case Interpreter::java_lang_math_abs: + entry_point = __ pc(); + if(hasFPU()) { + __ vldr_f64(d0, Address(sp)); + __ vabs_f64(d0, d0); + } else { + __ ldrd(r0, Address(sp)); + transcendental_entry = true; + } + break; + case Interpreter::java_lang_math_sqrt: + entry_point = __ pc(); + if(hasFPU()) { + __ vldr_f64(d0, Address(sp)); + __ vsqrt_f64(d0, d0); + } else { + __ ldrd(r0, Address(sp)); + transcendental_entry = true; + } + break; + case Interpreter::java_lang_math_sin : + case Interpreter::java_lang_math_cos : + case Interpreter::java_lang_math_tan : + case Interpreter::java_lang_math_log : + case Interpreter::java_lang_math_log10 : + case Interpreter::java_lang_math_exp : + entry_point = __ pc(); + transcendental_entry = true; +#ifndef HARD_FLOAT_CC + __ ldrd(r0, Address(sp)); +#else + __ vldr_f64(d0, Address(sp)); +#endif //HARD_FLOAT_CC + break; + case Interpreter::java_lang_math_pow : + entry_point = __ pc(); + transcendental_entry = true; +#ifndef HARD_FLOAT_CC + __ ldrd(r0, Address(sp, 2*Interpreter::stackElementSize)); + __ ldrd(r2, Address(sp)); +#else + __ vldr_f64(d0, Address(sp, 2*Interpreter::stackElementSize)); + __ vldr_f64(d1, Address(sp)); +#endif //HARD_FLOAT_CC + break; + case Interpreter::java_lang_math_fmaD : + case Interpreter::java_lang_math_fmaF : + if (UseFMA) { + __ unimplemented(); + } + break; + default: + ShouldNotReachHere(); + } + + if (entry_point) { + __ mov(sp, r4); + + if(transcendental_entry) { + __ mov(r4, lr); + continuation = r4; + generate_transcendental_entry(kind); +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f64(d0, r0, r1); + } +#endif + } + + __ b(continuation); + } + + return entry_point; +} + + // double trigonometrics and transcendentals + // static jdouble dsin(jdouble x); + // static jdouble dcos(jdouble x); + // static jdouble dtan(jdouble x); + // static jdouble dlog(jdouble x); + // static jdouble dlog10(jdouble x); + // static jdouble dexp(jdouble x); + // static jdouble dpow(jdouble x, jdouble y); + +void TemplateInterpreterGenerator::generate_transcendental_entry(AbstractInterpreter::MethodKind kind) { + address fn = NULL; + switch (kind) { +#ifdef __SOFTFP__ + case Interpreter::java_lang_math_abs: + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dabs); + break; + case Interpreter::java_lang_math_sqrt: + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dsqrt); + break; +#endif //__SOFTFP__ + case Interpreter::java_lang_math_sin : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dsin); + break; + case Interpreter::java_lang_math_cos : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dcos); + break; + case Interpreter::java_lang_math_tan : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dtan); + break; + case Interpreter::java_lang_math_log : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog); + break; + case Interpreter::java_lang_math_log10 : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dlog10); + break; + case Interpreter::java_lang_math_exp : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dexp); + break; + case Interpreter::java_lang_math_pow : + fn = CAST_FROM_FN_PTR(address, SharedRuntime::dpow); + 
break; + default: + ShouldNotReachHere(); + } + __ align_stack(); + __ mov(rscratch1, fn); + __ bl(rscratch1); +} + +// Abstract method entry +// Attempt to execute abstract method. Throw exception +address TemplateInterpreterGenerator::generate_abstract_entry(void) { + // rmethod: Method* + // r13: sender SP + + address entry_point = __ pc(); + + // abstract method entry + + // pop return address, reset last_sp to NULL + __ empty_expression_stack(); + __ restore_bcp(); // bcp must be correct for exception handler (was destroyed) + __ restore_locals(); // make sure locals pointer is correct as well (was destroyed) + + // throw exception + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_AbstractMethodErrorWithMethod), + rmethod); + // the call_VM checks for exception, so we should never return here. + __ should_not_reach_here(); + + return entry_point; +} + +address TemplateInterpreterGenerator::generate_StackOverflowError_handler() { + address entry = __ pc(); + +#ifdef ASSERT + { + Label L; + __ ldr(rscratch1, Address(rfp, + frame::get_interpreter_frame_monitor_block_top_offset() * + wordSize)); + __ mov(rscratch2, sp); + __ cmp(rscratch1, rscratch2); // maximal rsp for current rfp (stack + // grows negative) + __ b(L, Assembler::HS); // check if frame is complete + __ stop ("interpreter frame not set up"); + __ bind(L); + } +#endif // ASSERT + // Restore bcp under the assumption that the current frame is still + // interpreted + __ restore_bcp(); + + // expression stack must be empty before entering the VM if an + // exception happened + __ empty_expression_stack(); + // throw exception + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_StackOverflowError)); + return entry; +} + +address TemplateInterpreterGenerator::generate_ArrayIndexOutOfBounds_handler() { + address entry = __ pc(); + // expression stack must be empty before entering the VM if an + // exception happened + __ empty_expression_stack(); + // setup parameters + + // ??? convention: expect aberrant index in register r2 + // ??? convention: expect array in register r3 + __ mov(c_rarg1, r3); + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime:: + throw_ArrayIndexOutOfBoundsException), + c_rarg1, c_rarg2); + return entry; +} + +address TemplateInterpreterGenerator::generate_ClassCastException_handler() { + address entry = __ pc(); + + // object is at TOS + __ pop(c_rarg1); + + // expression stack must be empty before entering the VM if an + // exception happened + __ empty_expression_stack(); + + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime:: + throw_ClassCastException), + c_rarg1); + return entry; +} + +address TemplateInterpreterGenerator::generate_exception_handler_common( + const char* name, const char* message, bool pass_oop) { + assert(!pass_oop || message == NULL, "either oop or message but not both"); + address entry = __ pc(); + if (pass_oop) { + // object is at TOS + __ pop(c_rarg2); + } + // expression stack must be empty before entering the VM if an + // exception happened + __ empty_expression_stack(); + // FIXME shouldn't it be in rest of generate_* ? + // rdispatch assumed to cache dispatch table. This code can be called from + // signal handler, so it can't assume execption caller preserved the register, + // so restore it here + __ get_dispatch(); + // FIXME shouldn't get_method be here ? 
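+  // Depending on pass_oop, the code below either hands the popped TOS object
+  // to InterpreterRuntime::create_klass_exception or passes the optional
+  // message string to InterpreterRuntime::create_exception; both leave the
+  // exception oop in r0 before branching to the throw_exception entry.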
+ // setup parameters + __ lea(c_rarg1, Address((address)name)); + if (pass_oop) { + __ call_VM(r0, CAST_FROM_FN_PTR(address, + InterpreterRuntime:: + create_klass_exception), + c_rarg1, c_rarg2); + } else { + // kind of lame ExternalAddress can't take NULL because + // external_word_Relocation will assert. + if (message != NULL) { + __ lea(c_rarg2, Address((address)message)); + } else { + __ mov(c_rarg2, NULL_WORD); + } + __ call_VM(r0, + CAST_FROM_FN_PTR(address, InterpreterRuntime::create_exception), + c_rarg1, c_rarg2); + } + // throw exception + __ b(address(Interpreter::throw_exception_entry())); + return entry; +} + +address TemplateInterpreterGenerator::generate_return_entry_for(TosState state, int step, size_t index_size) { + address entry = __ pc(); + + __ print_method_exit(); + __ reg_printf("A. return_entry : 0x%08x%08x\n", r1, r0); + + // Restore stack bottom in case i2c adjusted stack + __ ldr(sp, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + // and NULL it as marker that sp is now tos until next java call + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + __ reg_printf("B. return_entry : 0x%08x%08x\n", r1, r0); + __ restore_bcp(); + __ restore_locals(); + __ restore_constant_pool_cache(); + __ get_method(rmethod); + __ reg_printf("C. return_entry : 0x%08x%08x\n", r1, r0); + + if (state == atos) { + Register obj = r0; + Register mdp = r1; + Register tmp = r2; + __ ldr(mdp, Address(rmethod, Method::method_data_offset())); + __ profile_return_type(mdp, obj, tmp); + } + + // Pop N words from the stack + __ get_cache_and_index_at_bcp(r3, r2, 1, index_size); + __ reg_printf("D. return_entry : 0x%08x%08x\n", r1, r0); + __ ldr(r3, Address(r3, ConstantPoolCache::base_offset() + ConstantPoolCacheEntry::flags_offset())); + __ andr(r3, r3, ConstantPoolCacheEntry::parameter_size_mask); + + __ add(sp, sp, r3, lsl(2)); + + // Restore machine SP + /*__ ldr(rscratch1, Address(rmethod, Method::const_offset())); + __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset())); + __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2); + __ ldr(rscratch2, + Address(rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize)); + __ sub(rscratch1, rscratch2, rscratch1, lsl(2)); + __ bic(sp, rscratch1, 0xf);*/ + + __ check_and_handle_popframe(rthread); + __ check_and_handle_earlyret(rthread); + + __ get_dispatch(); + __ reg_printf("E. 
return_entry : 0x%08x%08x\n", r1, r0); + __ dispatch_next(state, step); + + return entry; +} + +address TemplateInterpreterGenerator::generate_deopt_entry_for(TosState state, + int step, + address continuation) { + address entry = __ pc(); + __ restore_bcp(); + __ restore_locals(); + __ restore_constant_pool_cache(); + __ get_method(rmethod); + + __ get_dispatch(); + + // Calculate stack limit + __ ldr(rscratch1, Address(rmethod, Method::const_offset())); + __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset())); + __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2); + __ ldr(rscratch2, + Address(rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize)); + __ sub(rscratch1, rscratch2, rscratch1, lsl(2)); + __ bic(sp, rscratch1, 0xf); + + // Restore expression stack pointer + __ ldr(sp, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + // NULL last_sp until next java call + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + + // handle exceptions + { + Label L; + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, L); + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_pending_exception)); + __ should_not_reach_here(); + __ bind(L); + } + + if (continuation == NULL) { + __ dispatch_next(state, step); + } else { + __ jump_to_entry(continuation); + } + return entry; +} + + +address TemplateInterpreterGenerator::generate_result_handler_for( + BasicType type) { + address entry = __ pc(); + switch (type) { + case T_BOOLEAN: __ c2bool(r0); break; + case T_CHAR : __ uxth(r0, r0); break; + case T_BYTE : __ sxtb(r0, r0); break; + case T_SHORT : __ sxth(r0, r0); break; + case T_INT : /* nothing to do */ break; + case T_LONG : /* nothing to do */ break; + case T_VOID : /* nothing to do */ break; + case T_FLOAT : +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f32(d0, r0); + } +#endif + break; + case T_DOUBLE : +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f64(d0, r0, r1); + } +#endif + break; + case T_OBJECT : + // retrieve result from frame + __ reg_printf("In object result handler\n"); + __ ldr(r0, Address(rfp, frame::get_interpreter_frame_oop_temp_offset()*wordSize)); + // and verify it + __ verify_oop(r0); + break; + default : ShouldNotReachHere(); + } + __ b(lr); // return from result handler + return entry; +} + +address TemplateInterpreterGenerator::generate_safept_entry_for( + TosState state, + address runtime_entry) { + address entry = __ pc(); + __ push(state); + __ call_VM(noreg, runtime_entry); + __ membar(Assembler::AnyAny); + __ dispatch_via(vtos, Interpreter::_normal_table.table_for(vtos)); + return entry; +} + +// Helpers for commoning out cases in the various type of method entries. +// + + +// increment invocation count & check for overflow +// +// Note: checking for negative value instead of overflow +// so we have a 'sticky' overflow test +// +// rmethod: method +// +void TemplateInterpreterGenerator::generate_counter_incr( + Label* overflow, + Label* profile_method, + Label* profile_method_continue) { + Label done; + // Note: In tiered we increment either counters in Method* or in MDO depending if we're profiling or not. + if (TieredCompilation) { + int increment = InvocationCounter::count_increment; + Label no_mdo; + if (ProfileInterpreter) { + // Are we profiling? 
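+      // If an MDO is attached, the invocation counter in the MDO is bumped
+      // and tested against MethodData::invoke_mask_offset(); otherwise we
+      // fall through to no_mdo and use the MethodCounters-based counter.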
+ __ ldr(r0, Address(rmethod, Method::method_data_offset())); + __ cbz(r0, no_mdo); + // Increment counter in the MDO + const Address mdo_invocation_counter(r0, in_bytes(MethodData::invocation_counter_offset()) + + in_bytes(InvocationCounter::counter_offset())); + const Address mask(r0, in_bytes(MethodData::invoke_mask_offset())); + __ increment_mask_and_jump(mdo_invocation_counter, increment, mask, rscratch1, rscratch2, false, Assembler::EQ, overflow); + __ b(done); + } + __ bind(no_mdo); + // Increment counter in MethodCounters + const Address invocation_counter(rscratch2, + MethodCounters::invocation_counter_offset() + + InvocationCounter::counter_offset()); + const Address mask(rscratch2, in_bytes(MethodCounters::invoke_mask_offset())); + __ get_method_counters(rmethod, rscratch2, done); + __ increment_mask_and_jump(invocation_counter, increment, mask, rscratch1, rscratch2, false, Assembler::EQ, overflow); + __ bind(done); + } else { // not TieredCompilation + const Address backedge_counter(rscratch2, + MethodCounters::backedge_counter_offset() + + InvocationCounter::counter_offset()); + const Address invocation_counter(rscratch2, + MethodCounters::invocation_counter_offset() + + InvocationCounter::counter_offset()); + + __ get_method_counters(rmethod, rscratch2, done); + + if (ProfileInterpreter) { // %%% Merge this into MethodData* + __ ldr(r1, Address(rscratch2, MethodCounters::interpreter_invocation_counter_offset())); + __ add(r1, r1, 1); + __ str(r1, Address(rscratch2, MethodCounters::interpreter_invocation_counter_offset())); + } + // Update standard invocation counters + __ ldr(r1, invocation_counter); + __ ldr(r0, backedge_counter); + + __ add(r1, r1, InvocationCounter::count_increment); + __ mov(rscratch1, InvocationCounter::count_mask_value); + __ andr(r0, r0, rscratch1); + + __ str(r1, invocation_counter); + __ add(r0, r0, r1); // add both counters + + // profile_method is non-null only for interpreted method so + // profile_method != NULL == !native_call + + if (ProfileInterpreter && profile_method != NULL) { + // Test to see if we should create a method data oop + __ ldr(rscratch2, Address(rmethod, Method::method_counters_offset())); + __ ldr(rscratch2, Address(rscratch2, in_bytes(MethodCounters::interpreter_profile_limit_offset()))); + __ cmp(r0, rscratch2); + __ b(*profile_method_continue, Assembler::LT); + + // if no method data exists, go to profile_method + __ test_method_data_pointer(rscratch2, *profile_method); + } + + { + __ ldr(rscratch2, Address(rmethod, Method::method_counters_offset())); + __ ldr(rscratch2, Address(rscratch2, in_bytes(MethodCounters::interpreter_invocation_limit_offset()))); + __ cmp(r0, rscratch2); + __ b(*overflow, Assembler::HS); + } + __ bind(done); + } +} + +void TemplateInterpreterGenerator::generate_counter_overflow(Label& do_continue) { + + // Asm interpreter on entry + // On return (i.e. jump to entry_point) [ back to invocation of interpreter ] + // Everything as it was on entry + + // InterpreterRuntime::frequency_counter_overflow takes two + // arguments, the first (thread) is passed by call_VM, the second + // indicates if the counter overflow occurs at a backwards branch + // (NULL bcp). We pass zero for it. The call returns the address + // of the verified entry point for the method or NULL if the + // compilation did not complete (either went background or bailed + // out). 
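+  // Passing zero in c_rarg1 is the "NULL bcp" case mentioned above: the
+  // overflow happened on method entry, not at a backwards branch.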
+ __ mov(c_rarg1, 0); + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::frequency_counter_overflow), + c_rarg1); + + __ b(do_continue); +} + +// See if we've got enough room on the stack for locals plus overhead +// below JavaThread::stack_overflow_limit(). If not, throw a StackOverflowError +// without going through the signal handler, i.e., reserved and yellow zones +// will not be made usable. The shadow zone must suffice to handle the +// overflow. +// The expression stack grows down incrementally, so the normal guard +// page mechanism will work for that. +// +// NOTE: Since the additional locals are also always pushed (wasn't +// obvious in generate_method_entry) so the guard should work for them +// too. +// +// Args: +// r3: number of additional locals this frame needs (what we must check) +// rmethod: Method* +// +// Kills: +// r0 +void TemplateInterpreterGenerator::generate_stack_overflow_check(void) { + + // monitor entry size: see picture of stack set + // (generate_method_entry) and frame_amd64.hpp + const int entry_size = frame::interpreter_frame_monitor_size() * wordSize; + + // total overhead size: entry_size + (saved rbp through expr stack + // bottom). be sure to change this if you add/subtract anything + // to/from the overhead area + const int overhead_size = + -(frame::get_interpreter_frame_initial_sp_offset() * wordSize) + entry_size; + + const int page_size = os::vm_page_size(); + + Label after_frame_check; + + // see if the frame is greater than one page in size. If so, + // then we need to verify there is enough stack space remaining + // for the additional locals. + // + __ mov(rscratch1, (page_size - overhead_size) / Interpreter::stackElementSize); + __ cmp(r3, rscratch1); + __ b(after_frame_check, Assembler::LS); + + // compute rsp as if this were going to be the last frame on + // the stack before the red zone + + // locals + overhead, in bytes + __ mov(r0, overhead_size); + __ add(r0, r0, r3, lsl(Interpreter::logStackElementSize)); // 1 slot per parameter. + + const Address stack_limit(rthread, JavaThread::stack_overflow_limit_offset()); + __ ldr(rscratch1, stack_limit); + +#ifdef ASSERT + Label limit_okay; + // Verify that thread stack limit is non-zero. + __ cbnz(rscratch1, limit_okay); + __ stop("stack overflow limit is zero"); + __ bind(limit_okay); +#endif + + // Add stack limit to locals. + __ add(r0, r0, rscratch1); + + // Check against the current stack bottom. + __ cmp(sp, r0); + __ b(after_frame_check, Assembler::HI); + + // Remove the incoming args, peeling the machine SP back to where it + // was in the caller. This is not strictly necessary, but unless we + // do so the stack frame may have a garbage FP; this ensures a + // correct call stack that we can always unwind. + __ mov(sp, r4); + + // Note: the restored frame is not necessarily interpreted. + // Use the shared runtime version of the StackOverflowError. 
+ assert(StubRoutines::throw_StackOverflowError_entry() != NULL, "stub not yet generated"); + __ far_jump(RuntimeAddress(StubRoutines::throw_StackOverflowError_entry())); + + // all done with frame size check + __ bind(after_frame_check); +} + +// Allocate monitor and lock method (asm interpreter) +// +// Args: +// rmethod: Method* +// rlocals: locals +// +// Kills: +// r0 +// c_rarg0, c_rarg1, c_rarg2, c_rarg3, ...(param regs) +// rscratch1, rscratch2 (scratch regs) +void TemplateInterpreterGenerator::lock_method(void) { + // synchronize method + const Address access_flags(rmethod, Method::access_flags_offset()); + const Address monitor_block_top( + rfp, + frame::get_interpreter_frame_monitor_block_top_offset() * wordSize); + const int entry_size = frame::interpreter_frame_monitor_size() * wordSize; + +#ifdef ASSERT + { + Label L; + __ ldr(r0, access_flags); + __ tst(r0, JVM_ACC_SYNCHRONIZED); + __ b(L, Assembler::NE); + __ stop("method doesn't need synchronization"); + __ bind(L); + } +#endif // ASSERT + + // get synchronization object + { + Label done; + __ ldr(r0, access_flags); + __ tst(r0, JVM_ACC_STATIC); + // get receiver (assume this is frequent case) + __ ldr(r0, Address(rlocals, Interpreter::local_offset_in_bytes(0))); + __ b(done, Assembler::EQ); + __ load_mirror(r0, rmethod, r1); + +#ifdef ASSERT + { + Label L; + __ cbnz(r0, L); + __ stop("synchronization object is NULL"); + __ bind(L); + } +#endif // ASSERT + + __ bind(done); + } + + // add space for monitor & lock + __ sub(sp, sp, entry_size); // add space for a monitor entry + __ mov(rscratch1, sp); + __ str(rscratch1, monitor_block_top); // set new monitor block top + // store object + __ str(r0, Address(sp, BasicObjectLock::obj_offset_in_bytes())); + __ mov(c_rarg1, sp); // object address + __ lock_object(c_rarg1); +} + +// Generate a fixed interpreter frame. This is identical setup for +// interpreted methods and for native methods hence the shared code. +// +// Args: +// lr: return address +// rmethod: Method* +// rlocals: pointer to locals +// stack_pointer: previous sp +// r4 contains the sender sp +void TemplateInterpreterGenerator::generate_fixed_frame(bool native_call) { + // initialize fixed part of activation frame + __ reg_printf("About to print native entry, rmethod = %p\n", rmethod); + __ print_method_entry(rmethod, native_call); + + const int int_frame_size = 10; + const int common_frame_size = int_frame_size + frame::get_frame_size(); + const int frame_size = native_call ? 
common_frame_size + 2 : common_frame_size; + + if (native_call) { + // add 2 zero-initialized slots for native calls + __ sub(sp, sp, 2 * wordSize); + __ mov(rbcp, 0); + __ strd(rbcp, rbcp, Address(sp)); + } else { + __ ldr(rscratch1, Address(rmethod, Method::const_offset())); // get ConstMethod + __ add(rbcp, rscratch1, in_bytes(ConstMethod::codes_offset())); // get codebase + } + + __ enter(); + __ sub(sp, sp, int_frame_size * wordSize); + + __ strd(sp, rbcp, Address(sp)); + + if (ProfileInterpreter) { + Label method_data_continue; + __ ldr(rscratch1, Address(rmethod, Method::method_data_offset())); + __ cbz(rscratch1, method_data_continue); + __ lea(rscratch1, Address(rscratch1, in_bytes(MethodData::data_offset()))); + __ bind(method_data_continue); + __ strd(rscratch1, rmethod, Address(sp, 6 * wordSize)); // save Method* and mdp (method data pointer) + } else { + __ mov(rscratch1, 0); + __ strd(rscratch1, rmethod, Address(sp, 6 * wordSize)); // save Method* (no mdp) + } + + // Get mirror and store it in the frame as GC root for this Method* + __ load_mirror(rscratch1, rmethod, rcpool); + __ mov(rscratch2, 0); + __ strd(rscratch1, rscratch2, Address(sp, 4 * wordSize)); + + __ ldr(rcpool, Address(rmethod, Method::const_offset())); + __ ldr(rcpool, Address(rcpool, ConstMethod::constants_offset())); + __ ldr(rcpool, Address(rcpool, ConstantPool::cache_offset_in_bytes())); + __ strd(rlocals, rcpool, Address(sp, 2 * wordSize)); + + __ reg_printf("Three-quarters through\n"); + // set sender sp + // leave last_sp as null + __ mov(rscratch1, 0); + // r4 contains the sender sp + __ strd(rscratch1, r4, Address(sp, 8 * wordSize)); + + // Move SP out of the way + /*if (! native_call) { + __ ldr(rscratch1, Address(rmethod, Method::const_offset())); + __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset())); + __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 2); + __ sub(rscratch1, sp, rscratch1, lsl(2)); + __ bic(sp, rscratch1, 0xf); + }*/ + // FIXME This code moves the sp to after the end of the stack - if this is what's happening + // some calls out of the VM may need to be patched + __ reg_printf("Fully through\n"); +} + +// End of helpers + +// Various method entries +//------------------------------------------------------------------------------------------------------------------------ +// +// + +// Method entry for java.lang.ref.Reference.get. +address TemplateInterpreterGenerator::generate_Reference_get_entry(void) { + // Code: _aload_0, _getfield, _areturn + // parameter size = 1 + // + // The code that gets generated by this routine is split into 2 parts: + // 1. The "intrinsified" code for G1 (or any SATB based GC), + // 2. The slow path - which is an expansion of the regular method entry. + // + // Notes:- + // * In the G1 code we do not check whether we need to block for + // a safepoint. If G1 is enabled then we must execute the specialized + // code for Reference.get (except when the Reference object is null) + // so that we can log the value in the referent field with an SATB + // update buffer. + // If the code for the getfield template is modified so that the + // G1 pre-barrier code is executed when the current method is + // Reference.get() then going through the normal method entry + // will be fine. + // * The G1 code can, however, check the receiver object (the instance + // of java.lang.Reference) and jump to the slow path if null. 
If the + // Reference object is null then we obviously cannot fetch the referent + // and so we don't need to call the G1 pre-barrier. Thus we can use the + // regular method entry code to generate the NPE. + // + // This code is based on generate_accessor_entry. + // + // rmethod: Method* + // r13: senderSP must preserve for slow path, set SP to it on fast path + + // LR is live. It must be saved around calls. + + address entry = __ pc(); + + const int referent_offset = java_lang_ref_Reference::referent_offset; + guarantee(referent_offset > 0, "referent offset not initialized"); + + Label slow_path; + const Register local_0 = c_rarg0; + // Check if local 0 != NULL + // If the receiver is null then it is OK to jump to the slow path. + __ ldr(local_0, Address(sp, 0)); + __ cbz(local_0, slow_path); + + // Load the value of the referent field. + const Address field_address(local_0, referent_offset); + BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); + bs->load_word_at(_masm, IN_HEAP | ON_WEAK_OOP_REF, T_OBJECT, local_0, field_address, /*tmp1*/ rscratch2, /*tmp2*/ rscratch1); + + // areturn + __ mov(sp, r4); // set sp to sender sp + __ b(lr); + + // generate a vanilla interpreter entry as the slow path + __ bind(slow_path); + __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals)); + return entry; +} + +void TemplateInterpreterGenerator::bang_stack_shadow_pages(bool native_call) { + // Bang each page in the shadow zone. We can't assume it's been done for + // an interpreter frame with greater than a page of locals, so each page + // needs to be checked. Only true for non-native. + if (UseStackBanging) { + const int n_shadow_pages = JavaThread::stack_shadow_zone_size() / os::vm_page_size(); + const int start_page = native_call ? n_shadow_pages : 1; + const int page_size = os::vm_page_size(); + __ mov(rscratch1, 0); + for (int pages = start_page; pages <= n_shadow_pages ; pages++) { + __ sub(rscratch2, sp, pages*page_size); + __ str(rscratch1, Address(rscratch2)); + } + } +} + + +// Interpreter stub for calling a native method. (asm interpreter) +// This sets up a somewhat different looking stack for calling the +// native method than the typical interpreter frame setup. +address TemplateInterpreterGenerator::generate_native_entry(bool synchronized) { + // determine code generation flags + bool inc_counter = UseCompiler || CountCompiledCalls || LogTouchedMethods; + + // r1: Method* + // r4: sender sp + + address entry_point = __ pc(); + __ reg_printf("entering generate_native_entry, lr = %p, rfp = %p\n\tRBCP = %p\n", lr, rfp, rbcp); + + const Address constMethod (rmethod, Method::const_offset()); + const Address access_flags (rmethod, Method::access_flags_offset()); + const Address size_of_parameters(r2, ConstMethod:: + size_of_parameters_offset()); + + // get parameter size (always needed) + __ ldr(r2, constMethod); + __ load_unsigned_short(r2, size_of_parameters); + + // Native calls don't need the stack size check since they have no + // expression stack and the arguments are already on the stack and + // we only add a handful of words to the stack. 
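+  // Parameters sit on the caller's expression stack, so the first parameter
+  // (local 0) is at the highest address; rlocals is computed below as
+  // sp + size_of_parameters * wordSize - wordSize to point at it.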
+ + // rmethod: Method* + // r2: size of parameters + // r4: sender sp + + // for natives the size of locals is zero + + // compute beginning of parameters (rlocals) + __ add(rlocals, sp, r2, lsl(2)); + __ sub(rlocals, rlocals, wordSize); + __ reg_printf("(start of parameters) rlocals = %p, nparams = %d\n", rlocals, r2); + + // initialize fixed part of activation frame + generate_fixed_frame(true); + __ reg_printf("pushed new fixed frame, lr = %p, rfp = %p\n", lr, rfp); + + Register locals_sp = r4; // the overwrites rdispatch, we can restore at end + // !! If this canges, change the end of arguements in interpreterRT_aarch32.cpp + //__ mov(r4, sp); //Save top of arguments + + // make sure method is native & not abstract +#ifdef ASSERT + __ ldr(r0, access_flags); + { + Label L; + __ tst(r0, JVM_ACC_NATIVE); + __ b(L, Assembler::NE); + __ stop("tried to execute non-native method as native"); + __ bind(L); + } + { + Label L; + __ tst(r0, JVM_ACC_ABSTRACT); + __ b(L, Assembler::EQ); + __ stop("tried to execute abstract method in interpreter"); + __ bind(L); + } +#endif + + // Since at this point in the method invocation the exception + // handler would try to exit the monitor of synchronized methods + // which hasn't been entered yet, we set the thread local variable + // _do_not_unlock_if_synchronized to true. The remove_activation + // will check this flag. + + const Address do_not_unlock_if_synchronized(rthread, + in_bytes(JavaThread::do_not_unlock_if_synchronized_offset())); + __ mov(rscratch2, true); + __ strb(rscratch2, do_not_unlock_if_synchronized); + + // increment invocation count & check for overflow + Label invocation_counter_overflow; + if (inc_counter) { + generate_counter_incr(&invocation_counter_overflow, NULL, NULL); + } + + Label continue_after_compile; + __ bind(continue_after_compile); + + bang_stack_shadow_pages(true); + // Note rscratch1 will contain zero here due to bang_stack_shadow_pages + // reset the _do_not_unlock_if_synchronized flag + //__ mov(rscratch1, 0); + __ strb(rscratch1, do_not_unlock_if_synchronized); + + // check for synchronized methods + // Must happen AFTER invocation_counter check and stack overflow check, + // so method is not locked if overflows. 
+ if (synchronized) { + lock_method(); + } else { + // no synchronization necessary +#ifdef ASSERT + { + Label L; + __ ldr(r0, access_flags); + __ tst(r0, JVM_ACC_SYNCHRONIZED); + __ b(L, Assembler::EQ); + __ stop("method needs synchronization"); + __ bind(L); + } +#endif + } + + // start execution +#ifdef ASSERT + { + Label L; + const Address monitor_block_top(rfp, + frame::get_interpreter_frame_monitor_block_top_offset() * wordSize); + __ ldr(rscratch1, monitor_block_top); + __ cmp(sp, rscratch1); + __ b(L, Assembler::EQ); + __ stop("broken stack frame setup in interpreter"); + __ bind(L); + } +#endif + + // jvmti support + __ notify_method_entry(); + + const Register result_handler = rlocals; + //This is recomputed for the new function and result_handler is not written until + // after the function has been called + + // allocate space for parameters + __ ldr(rscratch1, Address(rmethod, Method::const_offset())); + __ load_unsigned_short(rscratch1, Address(rscratch1, ConstMethod::size_of_parameters_offset())); + + __ sub(sp, sp, rscratch1, lsl(Interpreter::logStackElementSize + 1)); + // This +1 is a hack to double the amount of space allocated for parameters, this is likely far + // more than needed as in the worst case when parameters have to be placed on the stack they would be aligned + // as follows LONG | INT | EMPTY | LONG ... This would only increase the space used by a half. + __ align_stack(); + __ mov(locals_sp, sp); + __ reg_printf("Stack Pointer on arg copy, sp = %p, locals_sp = %p, rlocals = %p\n", sp, locals_sp, rlocals); + + // get signature handler + { + Label L; + __ ldr(rscratch1, Address(rmethod, Method::signature_handler_offset())); + __ cmp(rscratch1, 0); + __ b(L, Assembler::NE); + __ reg_printf("Prepare_native_call, locals_sp = %p, rlocals = %p\n", locals_sp, rlocals); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::prepare_native_call), rmethod); + __ reg_printf("Finished prepare_native_call, locals_sp = %p, rlocals = %p\n", locals_sp, rlocals); + __ ldr(rscratch1, Address(rmethod, Method::signature_handler_offset())); + __ bind(L); + } + + // call signature handler + assert(InterpreterRuntime::SignatureHandlerGenerator::from() == rlocals, + "adjust this code"); + assert(InterpreterRuntime::SignatureHandlerGenerator::to() == locals_sp, + "adjust this code"); + assert(InterpreterRuntime::SignatureHandlerGenerator::temp() == rscratch1, + "adjust this code"); + + // The generated handlers do not touch rmethod (the method). + // However, large signatures cannot be cached and are generated + // each time here. The slow-path generator can do a GC on return, + // so we must reload it after the call. 
+ __ reg_printf("**BEFORE**\nrlocals = %p,locals_sp = %p, sp = %p\n", rlocals, locals_sp, sp); + __ reg_printf("About to call the Method::signature_handler = %p\n", rscratch1); + __ bl(rscratch1); + __ reg_printf("**AFTER**\nr0 : %p, r1 : %p, r2 : %p\n", r0, r1, r2); + __ reg_printf("r3 : %p, sp : %p\n", r3, sp); + __ get_method(rmethod); // slow path can do a GC, reload rmethod + + + + // result handler is in r0 + // set result handler + __ mov(result_handler, r0); + // pass mirror handle if static call + { + Label L; + __ ldr(rscratch1, Address(rmethod, Method::access_flags_offset())); + __ tst(rscratch1, JVM_ACC_STATIC); + __ b(L, Assembler::EQ); + // get mirror + __ load_mirror(rscratch1, rmethod, r1); + // copy mirror into activation frame + __ str(rscratch1, Address(rfp, frame::get_interpreter_frame_oop_temp_offset() * wordSize)); + // pass handle to mirror + __ add(c_rarg1, rfp, frame::get_interpreter_frame_oop_temp_offset() * wordSize); + __ bind(L); + } + + // get native function entry point in r14 + Register native_entry_point = r14; + + { + Label L; + __ ldr(native_entry_point, Address(rmethod, Method::native_function_offset())); + address unsatisfied = (SharedRuntime::native_method_throw_unsatisfied_link_error_entry()); + __ mov(rscratch2, unsatisfied); + __ ldr(rscratch2, rscratch2); + __ reg_printf("QWERTY native_entry_point = %p, unsatisfied_link_entry_point = %p\n", native_entry_point, rscratch2); + __ cmp(native_entry_point, rscratch2); + __ b(L, Assembler::NE); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::prepare_native_call), rmethod); + __ get_method(rmethod); + __ ldr(native_entry_point, Address(rmethod, Method::native_function_offset())); + __ bind(L); + } + + // pass JNIEnv + __ add(c_rarg0, rthread, in_bytes(JavaThread::jni_environment_offset())); + + // It is enough that the pc() points into the right code + // segment. It does not have to be the correct return pc. + __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); + + // change thread state +#ifdef ASSERT + { + Label L; + __ ldr(rscratch1, Address(rthread, JavaThread::thread_state_offset())); + __ cmp(rscratch1, _thread_in_Java); + __ b(L, Assembler::EQ); + __ stop("Wrong thread state in native stub"); + __ bind(L); + } +#endif + + // Change state to native + __ mov(rscratch1, _thread_in_native); + __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); + __ dmb(Assembler::ISH); + __ str(rscratch1, Address(rscratch2)); + + __ reg_printf("Calling native method, lr = %p & rmethod = %p\n", lr, rmethod); + // Call the native method. + /*__ reg_printf("**ONCALL**\nr0 : %p\nr1 : %p\nr2 : %p\n", r0, r1, r2); + __ reg_printf("r3 : %p\n\nr4 : %p\nrloc : %p\n", r3, r4, rlocals);*/ + __ reg_printf("Stack Pointer on entry to native, sp = %p\n", sp); + __ bl(native_entry_point); + __ reg_printf("Returned from native, lr = %p, r1 = %p, r0 = %p\n", lr, r1, r0); + __ maybe_isb(); + __ get_method(rmethod); + // result potentially in r0, or v0 + + // make room for the pushes we're about to do + //__ sub(rscratch1, sp, 4 * wordSize); + //__ bic(sp, rscratch1, 0xf); + // NOTE: The order of these pushes is known to frame::interpreter_frame_result + // in order to extract the result of a method call. If the order of these + // pushes change or anything else is added to the stack then the code in + // interpreter_frame_result must also change. + __ reg_printf("Before push dtos, ltos. 
sp = %p\n", sp); + __ push(dtos); + __ push(ltos); + + // change thread state + __ mov(rscratch1, _thread_in_native_trans); + __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); + __ dmb(Assembler::ISH); + __ str(rscratch1, Address(rscratch2)); + __ reg_printf("before os::is_MP\n"); + if (os::is_MP()) { + if (UseMembar) { + // Force this write out before the read below + __ membar(Assembler::AnyAny); + } else { + // Write serialization page so VM thread can do a pseudo remote membar. + // We use the current thread pointer to calculate a thread specific + // offset to write to within the page. This minimizes bus traffic + // due to cache line collision. + __ serialize_memory(rthread, rscratch2); + } + } + __ reg_printf("after os::is_MP\n"); + // check for safepoint operation in progress and/or pending suspend requests + { + Label L, Continue; + __ safepoint_poll_acquire(L); + __ ldr(rscratch2, Address(rthread, JavaThread::suspend_flags_offset())); + __ cbz(rscratch2, Continue); + __ bind(L); + + // Don't use call_VM as it will see a possible pending exception + // and forward it and never return here preventing us from + // clearing _last_native_pc down below. So we do a runtime call by + // hand. + // + __ mov(c_rarg0, rthread); + __ mov(rscratch2, CAST_FROM_FN_PTR(address, JavaThread::check_special_condition_for_native_trans)); + //__ blrt(rscratch2, 1, 0, 0); + __ bl(rscratch2); + __ maybe_isb(); + __ get_method(rmethod); + __ bind(Continue); + } + __ reg_printf("finished safepoint check\n"); + // change thread state + __ mov(rscratch1, _thread_in_Java); + __ lea(rscratch2, Address(rthread, JavaThread::thread_state_offset())); + __ dmb(Assembler::ISH); + __ str(rscratch1, Address(rscratch2)); + + // reset_last_Java_frame + __ reset_last_Java_frame(true); + + __ mov(rscratch1, 0); + if (CheckJNICalls) { + // clear_pending_jni_exception_check + __ str(rscratch1, Address(rthread, JavaThread::pending_jni_exception_check_fn_offset())); + } + + // reset handle block + __ ldr(rscratch2, Address(rthread, JavaThread::active_handles_offset())); + __ str(rscratch1, Address(rscratch2, JNIHandleBlock::top_offset_in_bytes())); + + // If result is an oop unbox and store it in frame where gc will see it + // and result handler will pick it up + __ reg_printf("finished checking last_Java_frame\n"); + { + Label no_oop, not_weak, store_result; + //__ bkpt(345); + //__ adr(rscratch2, ExternalAddress(AbstractInterpreter::result_handler(T_OBJECT))); + __ mov(rscratch2, ExternalAddress(AbstractInterpreter::result_handler(T_OBJECT))); + __ reg_printf("Comparing rscratch2 = %p and result_handler = %p\n", rscratch2, result_handler); + + __ cmp(rscratch2, result_handler); + __ b(no_oop, Assembler::NE); + __ reg_printf("It's an oop.\n"); + // Unbox oop result, e.g. JNIHandles::resolve result. 
+ __ pop(ltos); + __ resolve_jobject(r0, rthread, rscratch2); + __ str(r0, Address(rfp, frame::get_interpreter_frame_oop_temp_offset()*wordSize)); + // keep stack depth as expected by pushing oop which will eventually be discarded + __ push(ltos); + __ bind(no_oop); + } + + { + Label no_reguard; + __ lea(rscratch1, Address(rthread, in_bytes(JavaThread::stack_guard_state_offset()))); + __ ldrb(rscratch1, Address(rscratch1)); + __ cmp(rscratch1, JavaThread::stack_guard_yellow_reserved_disabled); + __ b(no_reguard, Assembler::NE); + + __ pusha(); // XXX only save smashed registers + __ mov(c_rarg0, rthread); + __ mov(rscratch2, CAST_FROM_FN_PTR(address, SharedRuntime::reguard_yellow_pages)); + __ bl(rscratch2); + __ popa(); // XXX only restore smashed registers + __ bind(no_reguard); + } + __ reg_printf("Restoring java-ish things\n"); + // The method register is junk from after the thread_in_native transition + // until here. Also can't call_VM until the bcp has been + // restored. Need bcp for throwing exception below so get it now. + __ get_method(rmethod); + __ get_dispatch(); // used to save sp in for args + // restore bcp to have legal interpreter frame, i.e., bci == 0 <=> + // rbcp == code_base() + __ ldr(rbcp, Address(rmethod, Method::const_offset())); // get ConstMethod* + __ add(rbcp, rbcp, in_bytes(ConstMethod::codes_offset())); // get codebase + // handle exceptions (exception handling will handle unlocking!) + { + Label L; + __ reg_printf("Checking pending exceptions\n"); + __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); + __ cbz(rscratch1, L); + // Note: At some point we may want to unify this with the code + // used in call_VM_base(); i.e., we should use the + // StubRoutines::forward_exception code. For now this doesn't work + // here because the rsp is not correctly set at this point. + __ reg_printf("Calling vm to throw_pending_exception\n"); + + // Need to restore lr? - introduced on aarch32 port + //__ ldr(lr, Address(rfp, frame::get_return_addr_offset())); + + __ MacroAssembler::call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_pending_exception)); + __ should_not_reach_here(); + __ bind(L); + } + + // do unlocking if necessary + { + Label L; + __ reg_printf("testing if we need to unlock\n"); + __ ldr(rscratch1, Address(rmethod, Method::access_flags_offset())); + __ tst(rscratch1, JVM_ACC_SYNCHRONIZED); + __ b(L, Assembler::EQ); + // the code below should be shared with interpreter macro + // assembler implementation + { + Label unlock; + // BasicObjectLock will be first in list, since this is a + // synchronized method. However, need to check that the object + // has not been unlocked by an explicit monitorexit bytecode. 
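+      // The method's monitor is the first BasicObjectLock just below the
+      // frame's initial SP; a NULL obj field there means it was already
+      // unlocked, which is reported as an IllegalMonitorStateException.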
+ + // monitor expect in c_rarg1 for slow unlock path + __ lea (c_rarg1, Address(rfp, // address of first monitor + (intptr_t)(frame::get_interpreter_frame_initial_sp_offset() * + wordSize - sizeof(BasicObjectLock)))); + + __ ldr(rscratch1, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes())); + __ reg_printf("Checking if we are already unlocked\n"); + __ cbnz(rscratch1, unlock); + + // Entry already unlocked, need to throw exception + __ MacroAssembler::call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_illegal_monitor_state_exception)); + __ should_not_reach_here(); + + __ bind(unlock); + __ reg_printf("Doing unlock\n"); + __ unlock_object(c_rarg1); + } + __ bind(L); + } + __ reg_printf("finished unlocking\n"); + // jvmti support + // Note: This must happen _after_ handling/throwing any exceptions since + // the exception handler code notifies the runtime of method exits + // too. If this happens before, method entry/exit notifications are + // not properly paired (was bug - gri 11/22/99). + __ notify_method_exit(vtos, InterpreterMacroAssembler::NotifyJVMTI); + + // restore potential result in r0:d0, call result handler to + // restore potential result in ST0 & handle result + __ reg_printf("Before pop dtos, ltos. sp = %p\n", sp); + __ pop(ltos); + __ pop(dtos); + + __ reg_printf("Calling result handler, r1 = %p, r0 = %p\n", r1, r0); + __ bl(result_handler); + __ reg_printf("Finished result_handler\n RFP NOW = %p, r0 = %p\n", rfp, r0); + + // remove activation restore sp to sender_sp + __ ldr(rscratch1, Address(rfp, + frame::get_interpreter_frame_sender_sp_offset() * + wordSize)); // get sender sp + // remove frame anchor & restore sp + __ leave(); + + __ mov(sp, rscratch1); // Native frame so two extra fields + __ reg_printf("Returning to Java execution, restored frame = %p, lr = %p\n\tRBCP = %p\n", rfp, lr, rbcp); + __ b(lr); + + if (inc_counter) { + // Handle overflow of counter and compile method + __ bind(invocation_counter_overflow); + generate_counter_overflow(continue_after_compile); + } + + return entry_point; +} + +address TemplateInterpreterGenerator::generate_CRC32_update_entry() { + if (UseCRC32Intrinsics) { + address entry = __ pc(); + + // rmethod: Method* + // sp: args + + Label slow_path; + // If we need a safepoint check, generate full interpreter entry. + __ safepoint_poll(slow_path); + + // We don't generate local frame and don't align stack because + // we call stub code and there is no safepoint on this path. + + // Load parameters + const Register crc = c_rarg0; // crc + const Register val = c_rarg1; // source java byte value + const Register tbl = c_rarg2; // scratch + + // Arguments are reversed on java expression stack + __ ldr(val, Address(sp, 0)); // byte value + __ ldr(crc, Address(sp, wordSize)); // Initial CRC + + __ lea(tbl, ExternalAddress(StubRoutines::crc_table_addr())); + __ inv(crc, crc); + __ update_byte_crc32(crc, val, tbl); + __ inv(crc, crc); // result in c_rarg0 + + __ mov(sp, r4); + __ ret(lr); + + // generate a vanilla native entry as the slow path + __ bind(slow_path); + __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native)); + + return entry; + } + return NULL; +} + +address TemplateInterpreterGenerator::generate_CRC32_updateBytes_inner(AbstractInterpreter::MethodKind kind, int is_crc32c) { + if (!is_crc32c ? 
UseCRC32Intrinsics : UseCRC32CIntrinsics) { + address entry = __ pc(); + + // rmethod,: Method* + // sp: senderSP must preserved for slow path + + Label slow_path; + // If we need a safepoint check, generate full interpreter entry. + __ safepoint_poll(slow_path); + + // We don't generate local frame and don't align stack because + // we call stub code and there is no safepoint on this path. + + // Load parameters + const Register crc = c_rarg0; // crc + const Register buf = c_rarg1; // source java byte array address + const Register len = c_rarg2; // length + const Register off = len; // offset (never overlaps with 'len') + const Register tmp = rscratch1;// tmp register used to load end in case crc32c + + // Arguments are reversed on java expression stack + // Calculate address of start element + if (!is_crc32c ? kind == Interpreter::java_util_zip_CRC32_updateByteBuffer : + kind == Interpreter::java_util_zip_CRC32C_updateDirectByteBuffer) { + __ ldr(buf, Address(sp, 2*wordSize)); // long buf + __ ldr(off, Address(sp, wordSize)); // offset + __ add(buf, buf, off); // + offset + __ ldr(crc, Address(sp, 4*wordSize)); // Initial CRC + } else { + __ ldr(buf, Address(sp, 2*wordSize)); // byte[] array + __ add(buf, buf, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size + __ ldr(off, Address(sp, wordSize)); // offset + __ add(buf, buf, off); // + offset + __ ldr(crc, Address(sp, 3*wordSize)); // Initial CRC + } + + // Can now load 'len' since we're finished with 'off' + if (!is_crc32c) { + __ ldr(len, Address(sp)); // Length + } else { + __ ldr(tmp, Address(sp)); + // len = end - offset + __ sub(len, tmp, off); + } + + __ mov(sp, r4); // Restore the caller's SP + + // We are frameless so we can just jump to the stub. + __ b(CAST_FROM_FN_PTR(address, !is_crc32c ? StubRoutines::updateBytesCRC32() : + StubRoutines::updateBytesCRC32C())); + + // generate a vanilla native entry as the slow path + __ bind(slow_path); + + if (!is_crc32c) + __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::native)); + else + __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals)); + + return entry; + } + return NULL; +} + +address TemplateInterpreterGenerator::generate_CRC32_updateBytes_entry(AbstractInterpreter::MethodKind kind) { + return generate_CRC32_updateBytes_inner(kind, false); +} +address TemplateInterpreterGenerator::generate_CRC32C_updateBytes_entry(AbstractInterpreter::MethodKind kind) { + return generate_CRC32_updateBytes_inner(kind, true); +} + +address TemplateInterpreterGenerator::generate_aescrypt_block_entry(AbstractInterpreter::MethodKind kind) { + // TODO enable once class fields offsets are known at this point + if (false && UseAESIntrinsics) { + const int K_offset = com_sun_crypto_provider_AESCrypt::K_offset(); + guarantee(K_offset > 0, "referent offset not initialized"); + + address entry = __ pc(); + + Label slow_path; + // If we need a safepoint check, generate full interpreter entry. 
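+    // (the slow path bound at the end of this entry simply dispatches to the regular
+    // zerolocals entry, so no frame needs to be built here)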
+    __ safepoint_poll(slow_path);
+
+    // Load parameters
+    const Register from = c_rarg0; // source java byte array address
+    const Register to = c_rarg1; // dest java byte array address
+    const Register key = c_rarg2; // key java array address
+    const Register off = c_rarg3; // offset
+
+    // Arguments are reversed on java expression stack
+    // Calculate address of start element
+    __ ldr(off, Address(sp)); // to buffer offset
+    __ ldr(to, Address(sp, wordSize)); // to buffer
+    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+    __ add(to, to, off);
+    __ ldr(off, Address(sp, 2 * wordSize)); // from buffer offset
+    __ ldr(from, Address(sp, 3 * wordSize)); // from buffer
+    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+    __ add(from, from, off);
+    // Load the value of the referent field.
+    __ ldr(key, Address(sp, 4 * wordSize)); // object itself
+    const Address field_address(key, K_offset);
+    __ load_heap_oop(key, field_address);
+    __ add(key, key, arrayOopDesc::base_offset_in_bytes(T_INT)); // + header size
+
+    __ mov(sp, r4); // Restore the caller's SP
+
+    if (kind == Interpreter::com_sun_crypto_provider_AESCrypt_encryptBlock) {
+      // We are frameless so we can just jump to the stub.
+      __ b(CAST_FROM_FN_PTR(address, StubRoutines::aescrypt_encryptBlock()));
+    } else {
+      // We are frameless so we can just jump to the stub.
+      __ b(CAST_FROM_FN_PTR(address, StubRoutines::aescrypt_decryptBlock()));
+    }
+
+    // generate a vanilla native entry as the slow path
+    __ bind(slow_path);
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
+
+    return entry;
+  }
+  return NULL;
+}
+
+address TemplateInterpreterGenerator::generate_cipherBlockChaining_encryptAESCrypt_entry(AbstractInterpreter::MethodKind kind) {
+  // TODO enable once class fields offsets are known at this point
+  if (false && UseAESIntrinsics && UseNeon) {
+    address entry = __ pc();
+
+    Label slow_path;
+    // If we need a safepoint check, generate full interpreter entry.
+    __ safepoint_poll(slow_path);
+
+    const int embeddedCipher_offset = com_sun_crypto_provider_FeedbackCipher::embeddedCipher_offset();
+    guarantee(embeddedCipher_offset > 0, "referent offset not initialized");
+    const int K_offset = com_sun_crypto_provider_AESCrypt::K_offset();
+    guarantee(K_offset > 0, "referent offset not initialized");
+    const int r_offset = com_sun_crypto_provider_CipherBlockChaining::r_offset();
+    guarantee(r_offset > 0, "referent offset not initialized");
+
+    // Load parameters
+    const Register from = c_rarg0; // source java byte array address
+    const Register to = c_rarg1; // dest java byte array address
+    const Register key = c_rarg2; // key java array address
+    const Register rvec = c_rarg3; // rvec java byte array address
+    const Register len = r4; // len of the input
+    const Register off = r5; // offset
+    const Register sp_pointer = r6; // sp
+
+    __ mov(sp_pointer, r4);
+    // Arguments are reversed on java expression stack:
+    // outBuffer offset, outBuffer, inBuffer len, inBuffer offset, inBuffer
+    // Calculate address of start element
+    __ ldr(off, Address(sp)); // to buffer offset
+    __ ldr(to, Address(sp, wordSize)); // to buffer
+    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+    __ add(to, to, off);
+    __ ldr(len, Address(sp, 2 * wordSize)); // len
+    __ ldr(off, Address(sp, 3 * wordSize)); // from buffer offset
+    __ ldr(from, Address(sp, 4 * wordSize)); // from buffer
+    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+    __ add(from, from, off);
+    // Load the value of the referent field.
+    __ ldr(rvec, Address(sp, 5 * wordSize)); // object itself
+    const Address field_address(rvec, r_offset);
+    __ load_heap_oop(rvec, field_address);
+    __ add(rvec, rvec, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size
+
+    __ ldr(key, Address(sp, 5 * wordSize)); // object itself
+    const Address field_address2(key, embeddedCipher_offset);
+    __ load_heap_oop(key, field_address2);
+    const Address field_address3(key, K_offset);
+    __ load_heap_oop(key, field_address3);
+    __ add(key, key, arrayOopDesc::base_offset_in_bytes(T_INT)); // + header size
+
+    __ mov(sp, sp_pointer); // Restore the caller's SP
+
+    if (kind == Interpreter::com_sun_crypto_provider_CipherBlockChaining_encrypt) {
+      // We are frameless so we can just jump to the stub.
+      __ b(CAST_FROM_FN_PTR(address, StubRoutines::cipherBlockChaining_encryptAESCrypt_special()));
+    } else {
+      // We are frameless so we can just jump to the stub.
+      __ b(CAST_FROM_FN_PTR(address, StubRoutines::cipherBlockChaining_decryptAESCrypt_special()));
+    }
+
+    // generate a vanilla native entry as the slow path
+    __ bind(slow_path);
+
+    __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals));
+
+    return entry;
+  }
+  return NULL;
+}
+
+address TemplateInterpreterGenerator::generate_SHA_implCompress_entry(AbstractInterpreter::MethodKind kind) {
+  // TODO enable once class fields offsets are known at this point
+  if (false && ((UseSHA1Intrinsics && kind == Interpreter::sun_security_provider_SHA_implCompress) ||
+      (UseSHA256Intrinsics && kind == Interpreter::sun_security_provider_SHA2_implCompress) ||
+      (UseSHA512Intrinsics && kind == Interpreter::sun_security_provider_SHA5_implCompress))) {
+    address entry = __ pc();
+
+    Label slow_path;
+    // If we need a safepoint check, generate full interpreter entry.
+ __ safepoint_poll(slow_path); + + int state_offset; + int state_data_offset; + address stub_addr; + switch (kind) { + case Interpreter::sun_security_provider_SHA_implCompress: + state_offset = sun_security_provider_SHA::state_offset(); + state_data_offset = arrayOopDesc::base_offset_in_bytes(T_INT); + stub_addr = StubRoutines::sha1_implCompress(); + break; + case Interpreter::sun_security_provider_SHA2_implCompress: + state_offset = sun_security_provider_SHA2::state_offset(); + state_data_offset = arrayOopDesc::base_offset_in_bytes(T_INT); + stub_addr = StubRoutines::sha256_implCompress(); + break; + case Interpreter::sun_security_provider_SHA5_implCompress: + state_offset = sun_security_provider_SHA5::state_offset(); + state_data_offset = arrayOopDesc::base_offset_in_bytes(T_LONG); + stub_addr = StubRoutines::sha512_implCompress(); + break; + default: + ShouldNotReachHere(); return NULL; // cannot be, stupid gcc + } + guarantee(state_offset > 0, "referent offset not initialized"); + + // Load parameters + const Register from = c_rarg0; // source java byte array address + const Register state = c_rarg1; // state java byte array address + const Register off = r3; // offset + + // Arguments are reversed on java expression stack: + // fromBufferOffset , fromBuffer + // Calculate address of start element + __ ldr(off, Address(sp)); // from buffer offset + __ ldr(from, Address(sp, wordSize)); // from buffer + __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_BYTE)); // + header size + __ add(from, from, off); + // Load the value of the referent field. + __ ldr(state, Address(sp, 2 * wordSize)); // object itself + const Address field_address(state, state_offset); + __ load_heap_oop(state, field_address); + __ add(state, state, state_data_offset); // + header size + + __ mov(sp, r4); // Restore the caller's SP + + // We are frameless so we can just jump to the stub. + __ b(CAST_FROM_FN_PTR(address, stub_addr)); + + // generate a vanilla native entry as the slow path + __ bind(slow_path); + + __ jump_to_entry(Interpreter::entry_for_kind(Interpreter::zerolocals)); + + return entry; + } + return NULL; +} + +// +// Generic interpreted method entry to (asm) interpreter +// +address TemplateInterpreterGenerator::generate_normal_entry(bool synchronized) { + // determine code generation flags + bool inc_counter = UseCompiler || CountCompiledCalls || LogTouchedMethods; + + // r4: sender sp + address entry_point = __ pc(); + + const Address constMethod(rmethod, Method::const_offset()); + const Address access_flags(rmethod, Method::access_flags_offset()); + const Address size_of_parameters(r3, + ConstMethod::size_of_parameters_offset()); + const Address size_of_locals(r3, ConstMethod::size_of_locals_offset()); + + // get parameter size (always needed) + // need to load the const method first + __ ldr(r3, constMethod); + __ load_unsigned_short(r2, size_of_parameters); + + // r2: size of parameters + + __ load_unsigned_short(r3, size_of_locals); // get size of locals in words + __ sub(r3, r3, r2); // r3 = no. of additional locals + + // see if we've got enough room on the stack for locals plus overhead. 
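+  // (the check runs before the locals area below is carved out of the stack)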
+ generate_stack_overflow_check(); + + // compute beginning of parameters (rlocals) + __ add(rlocals, sp, r2, lsl(2)); + __ sub(rlocals, rlocals, wordSize); + + // Make room for locals + __ sub(rscratch1, sp, r3, lsl(2)); + // Align the sp value + __ bic(sp, rscratch1, StackAlignmentInBytes-1); + + // r3 - # of additional locals + // allocate space for locals + // explicitly initialize locals + { + Label exit, loop; + __ mov(rscratch2, 0); + __ cmp(r3, 0); + __ b(exit, Assembler::LE); // do nothing if r3 <= 0 + __ bind(loop); + __ str(rscratch2, Address(__ post(rscratch1, wordSize))); + __ subs(r3, r3, 1); // until everything initialized + __ b(loop, Assembler::NE); + __ bind(exit); + } + __ reg_printf("Done locals space\n", r2); + + // initialize fixed part of activation frame + __ reg_printf("About to do fixed frame\n", r2); + generate_fixed_frame(false); + // And the base dispatch table + __ get_dispatch(); + // make sure method is not native & not abstract + __ reg_printf("Just done generate_fixed_frame; rmethod = %p\n", rmethod); +#ifdef ASSERT + __ ldr(r0, access_flags); + { + Label L; + __ tst(r0, JVM_ACC_NATIVE); + __ b(L, Assembler::EQ); + __ stop("tried to execute native method as non-native"); + __ bind(L); + } + { + Label L; + __ tst(r0, JVM_ACC_ABSTRACT); + __ b(L, Assembler::EQ); + __ stop("tried to execute abstract method in interpreter"); + __ bind(L); + } +#endif + + // Since at this point in the method invocation the exception + // handler would try to exit the monitor of synchronized methods + // which hasn't been entered yet, we set the thread local variable + // _do_not_unlock_if_synchronized to true. The remove_activation + // will check this flag. + + const Address do_not_unlock_if_synchronized(rthread, + in_bytes(JavaThread::do_not_unlock_if_synchronized_offset())); + __ mov(rscratch2, true); + __ strb(rscratch2, do_not_unlock_if_synchronized); + + Label no_mdp; + Register mdp = r3; + __ ldr(mdp, Address(rmethod, Method::method_data_offset())); + __ cbz(mdp, no_mdp); + __ add(mdp, mdp, in_bytes(MethodData::data_offset())); + __ profile_parameters_type(mdp, r1, r2); + __ bind(no_mdp); + + // increment invocation count & check for overflow + Label invocation_counter_overflow; + Label profile_method; + Label profile_method_continue; + if (inc_counter) { + generate_counter_incr(&invocation_counter_overflow, + &profile_method, + &profile_method_continue); + if (ProfileInterpreter) { + __ bind(profile_method_continue); + } + } + + Label continue_after_compile; + __ bind(continue_after_compile); + + bang_stack_shadow_pages(false); + // Note rscratch1 will contain zero here + // reset the _do_not_unlock_if_synchronized flag + __ strb(rscratch1, do_not_unlock_if_synchronized); + + // check for synchronized methods + // Must happen AFTER invocation_counter check and stack overflow check, + // so method is not locked if overflows. 
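+  // (lock_method() below is expected to allocate the initial monitor slot and lock
+  // the receiver, or the class mirror for static methods)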
+ if (synchronized) { + // Allocate monitor and lock method + lock_method(); + } else { + // no synchronization necessary +#ifdef ASSERT + { + Label L; + __ reg_printf("Checking synchronization, rmethod = %p\n", rmethod); + __ ldr(r0, access_flags); + __ tst(r0, JVM_ACC_SYNCHRONIZED); + __ b(L, Assembler::EQ); + __ stop("method needs synchronization"); + __ bind(L); + } +#endif + } + + // start execution +#ifdef ASSERT + { + Label L; + const Address monitor_block_top (rfp, + frame::get_interpreter_frame_monitor_block_top_offset() * wordSize); + __ ldr(rscratch1, monitor_block_top); + __ cmp(sp, rscratch1); + __ b(L, Assembler::EQ); + __ stop("broken stack frame setup in interpreter"); + __ bind(L); + } +#endif + + // jvmti support + __ notify_method_entry(); + __ reg_printf("About to dispatch, rmethod = %p, rlocals = %p\n", rmethod, rlocals); + __ dispatch_next(vtos); + __ reg_printf("Finshed dispatch? rmethod = %p\n", rmethod); + // invocation counter overflow + if (inc_counter) { + if (ProfileInterpreter) { + // We have decided to profile this method in the interpreter + __ bind(profile_method); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method)); + __ set_method_data_pointer_for_bcp(); + // don't think we need this + __ get_method(r1); + __ b(profile_method_continue); + } + // Handle overflow of counter and compile method + __ bind(invocation_counter_overflow); + generate_counter_overflow(continue_after_compile); + } + + __ reg_printf("Just completed normal entry, rmethod = %p\n", rmethod); + return entry_point; +} + +//----------------------------------------------------------------------------- +// Exceptions + +void TemplateInterpreterGenerator::generate_throw_exception() { + // Entry point in previous activation (i.e., if the caller was + // interpreted) + Interpreter::_rethrow_exception_entry = __ pc(); + __ reg_printf("rethrow_exception_entry\n"); + + // Restore sp to interpreter_frame_last_sp even though we are going + // to empty the expression stack for the exception processing. + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + // r0: exception + // r3: return address/pc that threw exception + __ restore_bcp(); // rbcp points to call/send + __ restore_locals(); + __ restore_constant_pool_cache(); + __ get_dispatch(); + + // Entry point for exceptions thrown within interpreter code + Interpreter::_throw_exception_entry = __ pc(); + __ reg_printf("throw_exception_entry\n"); + // If we came here via a NullPointerException on the receiver of a + // method, rmethod may be corrupt. 
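+  // Reload it from the interpreter frame before it is used below.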
+ __ get_method(rmethod); + // expression stack is undefined here + // r0: exception + // rbcp: exception bcp + __ verify_oop(r0); + __ mov(c_rarg1, r0); + + // expression stack must be empty before entering the VM in case of + // an exception + __ empty_expression_stack(); + // find exception handler address and preserve exception oop + __ call_VM(r3, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::exception_handler_for_exception), + c_rarg1); + + // Calculate stack limit + /*__ ldr(rscratch1, Address(rmethod, Method::const_offset())); + __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset())); + __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 4); + __ ldr(rscratch2, + Address(rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize)); + __ sub(rscratch1, rscratch2, rscratch1, lsl(2)); + __ bic(sp, rscratch1, 0xf);*/ + // Don't do this as we don't have a stack pointer + + // r0: exception handler entry point + // r3: preserved exception oop + // rbcp: bcp for exception handler + __ push_ptr(r3); // push exception which is now the only value on the stack + __ b(r0); // jump to exception handler (may be _remove_activation_entry!) + + // If the exception is not handled in the current frame the frame is + // removed and the exception is rethrown (i.e. exception + // continuation is _rethrow_exception). + // + // Note: At this point the bci is still the bxi for the instruction + // which caused the exception and the expression stack is + // empty. Thus, for any VM calls at this point, GC will find a legal + // oop map (with empty expression stack). + + // + // JVMTI PopFrame support + // + + Interpreter::_remove_activation_preserving_args_entry = __ pc(); + __ print_method_exit(false); + __ reg_printf("remove_activation_preserving_args_entry\n"); + __ empty_expression_stack(); + // Set the popframe_processing bit in pending_popframe_condition + // indicating that we are currently handling popframe, so that + // call_VMs that may happen later do not trigger new popframe + // handling cycles. + __ ldr(r3, Address(rthread, JavaThread::popframe_condition_offset())); + __ orr(r3, r3, JavaThread::popframe_processing_bit); + __ str(r3, Address(rthread, JavaThread::popframe_condition_offset())); + + { + // Check to see whether we are returning to a deoptimized frame. + // (The PopFrame call ensures that the caller of the popped frame is + // either interpreted or compiled and deoptimizes it if compiled.) + // In this case, we can't call dispatch_next() after the frame is + // popped, but instead must save the incoming arguments and restore + // them after deoptimization has occurred. + // + // Note that we don't compare the return PC against the + // deoptimization blob's unpack entry because of the presence of + // adapter frames in C2. + Label caller_not_deoptimized; + __ ldr(c_rarg1, Address(rfp, frame::get_return_addr_offset() * wordSize)); + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, + InterpreterRuntime::interpreter_contains), c_rarg1); + __ cbnz(r0, caller_not_deoptimized); + + // Compute size of arguments for saving when returning to + // deoptimized caller + __ get_method(r0); + __ ldr(r0, Address(r0, Method::const_offset())); + __ load_unsigned_short(r0, Address(r0, in_bytes(ConstMethod:: + size_of_parameters_offset()))); + __ lsl(r0, r0, Interpreter::logStackElementSize); + __ restore_locals(); // XXX do we need this? 
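+    // r0 holds the argument area size in bytes; compute its start address so the
+    // incoming arguments can be handed to Deoptimization::popframe_preserve_args below.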
+ __ sub(rlocals, rlocals, r0); + __ add(rlocals, rlocals, wordSize); + // Save these arguments + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, + Deoptimization:: + popframe_preserve_args), + rthread, r0, rlocals); + + __ remove_activation(vtos, + /* throw_monitor_exception */ false, + /* install_monitor_exception */ false, + /* notify_jvmdi */ false); + + // Inform deoptimization that it is responsible for restoring + // these arguments + __ mov(rscratch1, JavaThread::popframe_force_deopt_reexecution_bit); + __ str(rscratch1, Address(rthread, JavaThread::popframe_condition_offset())); + + // Continue in deoptimization handler + __ b(lr); + + __ bind(caller_not_deoptimized); + } + + __ remove_activation(vtos, + /* throw_monitor_exception */ false, + /* install_monitor_exception */ false, + /* notify_jvmdi */ false); + + // Restore the last_sp and null it out + __ ldr(sp, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + __ mov(rscratch1, 0); + __ str(rscratch1, Address(rfp, frame::get_interpreter_frame_last_sp_offset() * wordSize)); + // remove_activation restores sp? + + __ restore_bcp(); + __ restore_locals(); + __ restore_constant_pool_cache(); + __ get_method(rmethod); + __ get_dispatch(); + + // The method data pointer was incremented already during + // call profiling. We have to restore the mdp for the current bcp. + if (ProfileInterpreter) { + __ set_method_data_pointer_for_bcp(); + } + + // Clear the popframe condition flag + __ mov(rscratch1, JavaThread::popframe_inactive); + __ str(rscratch1, Address(rthread, JavaThread::popframe_condition_offset())); + assert(JavaThread::popframe_inactive == 0, "fix popframe_inactive"); + +#if INCLUDE_JVMTI + { + Label L_done; + __ ldrb(rscratch1, Address(rbcp, 0)); + __ cmp(rscratch1, Bytecodes::_invokestatic); + __ b(L_done, Assembler::EQ); + + // The member name argument must be restored if _invokestatic is re-executed after a PopFrame call. + // Detect such a case in the InterpreterRuntime function and return the member name argument, or NULL. 
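+    // The candidate argument is in local 0; a non-NULL result is written back to the
+    // top of the expression stack below.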
+ + __ ldr(c_rarg0, Address(rlocals, 0)); + __ call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::member_name_arg_or_null), c_rarg0, rmethod, rbcp); + + __ cbz(r0, L_done); + + __ str(r0, Address(sp, 0)); + __ bind(L_done); + } +#endif // INCLUDE_JVMTI + + // Restore machine SP + /*__ ldr(rscratch1, Address(rmethod, Method::const_offset())); + __ ldrh(rscratch1, Address(rscratch1, ConstMethod::max_stack_offset())); + __ add(rscratch1, rscratch1, frame::interpreter_frame_monitor_size() + 4); + __ ldr(rscratch2, + Address(rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize)); + __ sub(rscratch1, rscratch2, rscratch1, lsl(2)); + __ bic(sp, rscratch1, 0xf);*/ + + __ dispatch_next(vtos); + // end of PopFrame support + + Interpreter::_remove_activation_entry = __ pc(); + __ print_method_exit(false); + __ reg_printf("remove_activation_entry\n"); + + // preserve exception over this code sequence + __ pop_ptr(r0); + __ str(r0, Address(rthread, JavaThread::vm_result_offset())); + // remove the activation (without doing throws on illegalMonitorExceptions) + __ remove_activation(vtos, false, true, false); + // restore exception + // restore exception + __ get_vm_result(r0, rthread); + + // In between activations - previous activation type unknown yet + // compute continuation point - the continuation point expects the + // following registers set up: + // + // r0: exception + // lr: return address/pc that threw exception + // rsp: expression stack of caller + // rfp: fp of caller + // FIXME: There's no point saving LR here because VM calls don't trash it + __ strd(r0, lr, Address(__ pre(sp, -2 * wordSize))); // save exception & return address + __ super_call_VM_leaf(CAST_FROM_FN_PTR(address, + SharedRuntime::exception_handler_for_return_address), + rthread, lr); + __ mov(r1, r0); // save exception handler + __ ldrd(r0, lr, Address(__ post(sp, 2 * wordSize))); // restore exception & return address + // We might be returning to a deopt handler that expects r3 to + // contain the exception pc + __ mov(r3, lr); + // Note that an "issuing PC" is actually the next PC after the call + __ b(r1); // jump to exception + // handler of caller +} + + +// +// JVMTI ForceEarlyReturn support +// +address TemplateInterpreterGenerator::generate_earlyret_entry_for(TosState state) { + address entry = __ pc(); + __ restore_bcp(); + __ restore_locals(); + __ empty_expression_stack(); + __ load_earlyret_value(state); + + __ ldr(rscratch1, Address(rthread, JavaThread::jvmti_thread_state_offset())); + Address cond_addr(rscratch1, JvmtiThreadState::earlyret_state_offset()); + + // Clear the earlyret state + assert(JvmtiThreadState::earlyret_inactive == 0, "should be"); + __ mov(rscratch2, 0); + __ str(rscratch2, cond_addr); + + __ remove_activation(state, + false, /* throw_monitor_exception */ + false, /* install_monitor_exception */ + true); /* notify_jvmdi */ + __ b(lr); + + return entry; +} // end of ForceEarlyReturn support + + + +//----------------------------------------------------------------------------- +// Helper for vtos entry point generation + +void TemplateInterpreterGenerator::set_vtos_entry_points(Template* t, + address& bep, + address& cep, + address& sep, + address& aep, + address& iep, + address& lep, + address& fep, + address& dep, + address& vep) { + assert(t->is_valid() && t->tos_in() == vtos, "illegal template"); + Label L; + aep = __ pc(); __ push_ptr(); __ b(L); + dep = __ pc(); + if(hasFPU()){ + __ push_d(); __ b(L); + } + lep = __ pc(); __ push_l(); __ b(L); + fep = __ pc(); + 
if(hasFPU()){ + __ push_f(); __ b(L); + } + bep = cep = sep = + iep = __ pc(); __ push_i(); + vep = __ pc(); + __ bind(L); + generate_and_dispatch(t); +} + +//----------------------------------------------------------------------------- + +// Non-product code +#ifndef PRODUCT +address TemplateInterpreterGenerator::generate_trace_code(TosState state) { + address entry = __ pc(); + + __ push(state); + // Save all registers on stack, so omit SP and PC + const RegSet push_set = RegSet::range(r0, r12) + lr; + const int push_set_cnt = __builtin_popcount(push_set.bits()); + __ push(push_set, sp); + __ ldr(c_rarg2, Address(sp, push_set_cnt*wordSize)); // Pass top of stack + __ ldr(c_rarg3, Address(sp, (push_set_cnt+1)*wordSize)); // Pass top of stack high part/2nd stack word + __ call_VM(noreg, + //TODO: XXX: moved from SharedRuntime to InterpreterRuntime + CAST_FROM_FN_PTR(address, InterpreterRuntime::trace_bytecode), + c_rarg1, c_rarg2, c_rarg3); + __ pop(RegSet::range(r0, r12) + lr, sp); + __ pop(state); + __ b(lr); // return from result handler + + return entry; +} + +void TemplateInterpreterGenerator::count_bytecode() { + __ push(c_rarg0); + __ push(rscratch1); + __ push(rscratch2); + Label L; + __ mov(rscratch2, (address) &BytecodeCounter::_counter_value); + __ bind(L); + __ ldrex(rscratch1, rscratch2); + __ add(rscratch1, rscratch1, 1); + // strex stores 2nd arg to dest adressed by 3rd arg, + // stores status to 1st arg. So, 1st and 2nd shoud be different. + __ strex(c_rarg0, rscratch1, rscratch2); + __ cmp(c_rarg0, 0); + __ b(L, Assembler::NE); + __ pop(rscratch2); + __ pop(rscratch1); + __ pop(c_rarg0); +} + +void TemplateInterpreterGenerator::histogram_bytecode(Template* t) { ; } + +void TemplateInterpreterGenerator::histogram_bytecode_pair(Template* t) { ; } + + +void TemplateInterpreterGenerator::trace_bytecode(Template* t) { + // Call a little run-time stub to avoid blow-up for each bytecode. + // The run-time runtime saves the right registers, depending on + // the tosca in-state for the given template. + + assert(Interpreter::trace_code(t->tos_in()) != NULL, + "entry must have been generated"); + __ bl(Interpreter::trace_code(t->tos_in())); +} + + +void TemplateInterpreterGenerator::stop_interpreter_at() { + Label L; + __ push(rscratch1); + __ mov(rscratch1, (address) &BytecodeCounter::_counter_value); + __ ldr(rscratch1, Address(rscratch1)); + __ mov(rscratch2, StopInterpreterAt); + __ cmp(rscratch1, rscratch2); + __ b(L, Assembler::NE); + __ bkpt(0); + __ bind(L); + __ pop(rscratch1); +} + +#endif // !PRODUCT --- /dev/null 2018-09-25 19:25:29.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/templateTable_aarch32.cpp 2018-09-25 19:25:29.000000000 +0300 @@ -0,0 +1,4437 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "gc/shared/barrierSetAssembler.hpp" +#include "interp_masm_aarch32.hpp" +#include "interpreter/interpreter.hpp" +#include "interpreter/interpreterRuntime.hpp" +#include "interpreter/templateTable.hpp" +#include "memory/universe.hpp" +#include "oops/method.hpp" +#include "oops/methodData.hpp" +#include "oops/objArrayKlass.hpp" +#include "oops/oop.inline.hpp" +#include "prims/methodHandles.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/synchronizer.hpp" +#include "vm_version_aarch32.hpp" + +#define __ _masm-> + +// Platform-dependent initialization + +extern void aarch32TestHook(); + +void TemplateTable::pd_initialize() { + aarch32TestHook(); +} + +// Address computation: local variables + +static inline Address iaddress(int n) { + return Address(rlocals, Interpreter::local_offset_in_bytes(n)); +} + +static inline Address laddress(int n) { + return iaddress(n + 1); +} + +static inline Address faddress(int n) { + return iaddress(n); +} + +static inline Address daddress(int n) { + return laddress(n); +} + +static inline Address aaddress(int n) { + return iaddress(n); +} + +static inline Address iaddress(Register r) { + return Address(rlocals, r, lsl(2)); +} + +// Note these two are different as VLDR/VSTR don't +// support base + (offset{ << x }) +static inline Address faddress(Register r, Register scratch, + InterpreterMacroAssembler* _masm) { + __ lea(scratch, Address(rlocals, r, lsl(2))); + return Address(scratch); +} + +static inline Address daddress(Register r, Register scratch, + InterpreterMacroAssembler* _masm) { + __ lea(scratch, Address(rlocals, r, lsl(2))); + return Address(scratch, Interpreter::local_offset_in_bytes(1)); +} + +static inline Address laddress(Register r, Register scratch, + InterpreterMacroAssembler * _masm) { + return daddress(r, scratch, _masm); +} + +static inline Address aaddress(Register r) { + return iaddress(r); +} + +static inline Address at_rsp() { + return Address(sp, 0); +} + +// At top of Java expression stack which may be different than sp(). It +// isn't for category 1 objects. 
+static inline Address at_tos () { + return Address(sp, Interpreter::expr_offset_in_bytes(0)); +} + +static inline Address at_tos_p1() { + return Address(sp, Interpreter::expr_offset_in_bytes(1)); +} + +static inline Address at_tos_p2() { + return Address(sp, Interpreter::expr_offset_in_bytes(2)); +} + +static inline Address at_tos_p3() { + return Address(sp, Interpreter::expr_offset_in_bytes(3)); +} + +static inline Address at_tos_p4() { + return Address(sp, Interpreter::expr_offset_in_bytes(4)); +} + +static inline Address at_tos_p5() { + return Address(sp, Interpreter::expr_offset_in_bytes(5)); +} + +// Condition conversion +static Assembler::Condition j_not(TemplateTable::Condition cc) { + switch (cc) { + case TemplateTable::equal : return Assembler::NE; + case TemplateTable::not_equal : return Assembler::EQ; + case TemplateTable::less : return Assembler::GE; + case TemplateTable::less_equal : return Assembler::GT; + case TemplateTable::greater : return Assembler::LE; + case TemplateTable::greater_equal: return Assembler::LT; + } + ShouldNotReachHere(); + return Assembler::EQ; +} + + +// Miscelaneous helper routines +// Store an oop (or NULL) at the Address described by obj. +// If val == noreg this means store a NULL +static void do_oop_store(InterpreterMacroAssembler* _masm, + Address dst, + Register val, + DecoratorSet decorators) { + assert(val == noreg || val == r0, "parameter is just for looks"); + assert(!dst.uses(r1) && !dst.uses(r14), "destroyed register"); + __ store_heap_oop(dst, val, r14, r1, decorators); +} + +static void do_oop_load(InterpreterMacroAssembler* _masm, + Address src, + Register dst, + DecoratorSet decorators) { + __ load_heap_oop(dst, src, r14, r1, decorators); +} + +Address TemplateTable::at_bcp(int offset) { + assert(_desc->uses_bcp(), "inconsistent uses_bcp information"); + return Address(rbcp, offset); +} + +void TemplateTable::patch_bytecode(Bytecodes::Code bc, Register bc_reg, + Register temp_reg, bool load_bc_into_bc_reg/*=true*/, + int byte_no) +{ + if (!RewriteBytecodes) return; + Label L_patch_done; + + switch (bc) { + case Bytecodes::_fast_aputfield: + case Bytecodes::_fast_bputfield: + case Bytecodes::_fast_zputfield: + case Bytecodes::_fast_cputfield: + case Bytecodes::_fast_dputfield: + case Bytecodes::_fast_fputfield: + case Bytecodes::_fast_iputfield: + case Bytecodes::_fast_lputfield: + case Bytecodes::_fast_sputfield: + { + // We skip bytecode quickening for putfield instructions when + // the put_code written to the constant pool cache is zero. + // This is required so that every execution of this instruction + // calls out to InterpreterRuntime::resolve_get_put to do + // additional, required work. + assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range"); + assert(load_bc_into_bc_reg, "we use bc_reg as temp"); + __ get_cache_and_index_and_bytecode_at_bcp(temp_reg, bc_reg, temp_reg, byte_no, 1); + __ mov(bc_reg, bc); + __ cmp(temp_reg, (unsigned) 0); + __ b(L_patch_done, Assembler::EQ); // don't patch + } + break; + default: + assert(byte_no == -1, "sanity"); + // the pair bytecodes have already done the load. 
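+    // Only materialize the bytecode constant here when the caller asked for it.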
+ if (load_bc_into_bc_reg) { + __ mov(bc_reg, bc); + } + } + + if (JvmtiExport::can_post_breakpoint()) { + Label L_fast_patch; + // if a breakpoint is present we can't rewrite the stream directly + __ load_unsigned_byte(temp_reg, at_bcp(0)); + __ cmp(temp_reg, Bytecodes::_breakpoint); + __ b(L_fast_patch, Assembler::NE); + // Let breakpoint table handling rewrite to quicker bytecode + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::set_original_bytecode_at), rmethod, rbcp, bc_reg); + __ b(L_patch_done); + __ bind(L_fast_patch); + } + +#ifdef ASSERT + Label L_okay; + __ load_unsigned_byte(temp_reg, at_bcp(0)); + __ cmp(temp_reg, (int) Bytecodes::java_code(bc)); + __ b(L_okay, Assembler::EQ); + __ cmp(temp_reg, bc_reg); + __ b(L_okay, Assembler::EQ); + __ stop("patching the wrong bytecode"); + __ bind(L_okay); +#endif + + // patch bytecode + __ strb(bc_reg, at_bcp(0)); + __ bind(L_patch_done); +} + + +// Individual instructions + +void TemplateTable::nop() { + transition(vtos, vtos); + // nothing to do +} + +void TemplateTable::shouldnotreachhere() { + transition(vtos, vtos); + __ stop("shouldnotreachhere bytecode"); +} + +void TemplateTable::aconst_null() +{ + transition(vtos, atos); + __ mov(r0, 0); +} + +void TemplateTable::iconst(int value) +{ + transition(vtos, itos); + __ mov(r0, value); +} + +void TemplateTable::lconst(int value) +{ + // int is 32 bit and only ever used for loading small values + __ mov(r0, value & 0xffffffff); + __ mov(r1, 0); +} + +void TemplateTable::fconst(int value) +{ + transition(vtos, ftos); + float fval = value; + assert(value == 0 || value == 1 || value == 2, "invalid float const"); + if (hasFPU()) { + if(__ operand_valid_for_float_immediate(fval)) { + __ vmov_f32(d0, fval); + } else { + __ mov(r0, *((uint32_t*)&fval)); + __ vmov_f32(d0, r0); + } + } else { + __ mov(r0, *((uint32_t*)&fval)); + } +} + +void TemplateTable::dconst(int value) +{ + transition(vtos, dtos); + double dval = value; + assert(value == 0 || value == 1 || value == 2, "invalid double const"); + if (hasFPU()) { + if(__ operand_valid_for_double_immediate(dval)) { + __ vmov_f64(d0, dval); + } else { + uint32_t* ptr = (uint32_t*)&dval; + __ mov(r0, *ptr); + __ mov(r1, *(ptr + 1)); + __ vmov_f64(d0, r0, r1); + } + } else { + uint32_t* ptr = (uint32_t*)&dval; + __ mov(r0, *ptr); + __ mov(r1, *(ptr + 1)); + } +} + +void TemplateTable::bipush() +{ + transition(vtos, itos); + __ load_signed_byte(r0, at_bcp(1)); +} + +void TemplateTable::sipush() +{ + transition(vtos, itos); + __ load_unsigned_short(r0, at_bcp(1)); + __ rev(r0, r0); + __ asr(r0, r0, 16); +} + +void TemplateTable::ldc(bool wide) +{ + transition(vtos, vtos); + Label call_ldc, notFloat, notClass, notInt, Done; + + if (wide) { + __ get_unsigned_2_byte_index_at_bcp(r1, 1); + } else { + __ load_unsigned_byte(r1, at_bcp(1)); + } + __ get_cpool_and_tags(r2, r0); + + const int base_offset = ConstantPool::header_size() * wordSize; + const int tags_offset = Array::base_offset_in_bytes(); + + // get type + __ add(r3, r1, tags_offset); + __ ldrb(r3, Address(r0, r3)); + + // unresolved class - get the resolved class + __ cmp(r3, JVM_CONSTANT_UnresolvedClass); + __ b(call_ldc, Assembler::EQ); + + // unresolved class in error state - call into runtime to throw the error + // from the first resolution attempt + __ cmp(r3, JVM_CONSTANT_UnresolvedClassInError); + __ b(call_ldc, Assembler::EQ); + + // resolved class - need to call vm to get java mirror of the class + __ cmp(r3, JVM_CONSTANT_Class); + __ b(notClass, Assembler::NE); + + 
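+  // All three class-like tag cases above resolve through the same runtime call.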
__ bind(call_ldc); + __ mov(c_rarg1, wide); + call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::ldc), c_rarg1); + __ push_ptr(r0); + __ verify_oop(r0); + __ b(Done); + + __ bind(notClass); + if (hasFPU()) { + __ cmp(r3, JVM_CONSTANT_Float); + __ b(notFloat, Assembler::NE); + // ftos + __ adds(r1, r2, r1, lsl(2)); + __ vldr_f32(d0, Address(r1, base_offset)); + + __ push_f(); + + __ b(Done); + + __ bind(notFloat); + } else { + // Soft FP pass through T_INT case. +#ifdef ASSERT + __ cmp(r3, JVM_CONSTANT_Float); + __ mov(r3, JVM_CONSTANT_Integer, Assembler::EQ); +#endif // ASSER + } + + __ cmp(r3, JVM_CONSTANT_Integer); + __ b(notInt, Assembler::NE); + + // itos + __ adds(r1, r2, r1, lsl(2)); + __ ldr(r0, Address(r1, base_offset)); + __ push_i(r0); + __ b(Done); + + __ bind(notInt); + condy_helper(Done); + + __ bind(Done); +} + +// Fast path for caching oop constants. +void TemplateTable::fast_aldc(bool wide) +{ + transition(vtos, atos); + + Register result = r0; + Register tmp = r1; + Register rarg = r2; + + int index_size = wide ? sizeof(u2) : sizeof(u1); + + Label resolved; + + // We are resolved if the resolved reference cache entry contains a + // non-null object (String, MethodType, etc.) + assert_different_registers(result, tmp); + __ get_cache_index_at_bcp(tmp, 1, index_size); + __ load_resolved_reference_at_index(result, tmp); + __ cbnz(result, resolved); + + address entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_ldc); + + // first time invocation - must resolve first + __ mov(rarg, (int)bytecode()); + __ call_VM(result, entry, rarg); + + __ bind(resolved); + + { // Check for the null sentinel. + // If we just called the VM, it already did the mapping for us, + // but it's harmless to retry. + + // Stash null_sentinel address to get its value later + __ movptr(rarg, (uintptr_t)Universe::the_null_sentinel_addr()); + __ ldr(tmp, Address(rarg)); + __ cmp(result, tmp); + __ mov(result, 0, Assembler::EQ); // NULL object reference + } + + if (VerifyOops) { + // Safe to call with 0 result + __ verify_oop(result); + } +} + +void TemplateTable::ldc2_w() +{ + transition(vtos, vtos); + Label notLong, Done; + __ get_unsigned_2_byte_index_at_bcp(r0, 1); + + __ get_cpool_and_tags(r1, r2); + const int base_offset = ConstantPool::header_size() * wordSize; + const int tags_offset = Array::base_offset_in_bytes(); + + // get type + __ lea(r2, Address(r2, r0, lsl(0))); + __ load_unsigned_byte(r2, Address(r2, tags_offset)); + if (hasFPU()) { + Label notDouble; + __ cmp(r2, (int)JVM_CONSTANT_Double); + __ b(notDouble, Assembler::NE); + // dtos + __ lea (r2, Address(r1, r0, lsl(2))); + __ vldr_f64(d0, Address(r2, base_offset)); + __ push_d(); + __ b(Done); + + __ bind(notDouble); + } + __ cmp(r2, (int)JVM_CONSTANT_Long); + __ b(notLong, Assembler::NE); + // ltos + __ lea(r1, Address(r1, r0, lsl(2))); + __ ldr(r0, Address(r1, base_offset)); + __ ldr(r1, Address(r1, base_offset + wordSize)); + __ push_l(); + __ b(Done); + + __ bind(notLong); + condy_helper(Done); + __ bind(Done); +} + +void TemplateTable::condy_helper(Label& Done) +{ + Register obj = r0; + Register rarg = r1; + Register flags = r2; + Register off = r3; + + address entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_ldc); + + __ mov(rarg, (int) bytecode()); + __ call_VM(obj, entry, rarg); + + __ get_vm_result_2(flags, rthread); + + // VMr = obj = base address to find primitive value to push + // VMr2 = flags = (tos, off) using format of CPCE::_flags + assert(ConstantPoolCacheEntry::field_index_mask == 
right_n_bits(ConstantPoolCacheEntry::field_index_bits), + "fix the next line"); + __ ubfx(off, flags, 0, ConstantPoolCacheEntry::field_index_bits); + + const Address field(obj, off); + + // What sort of thing are we loading? + __ ubfx(flags, flags, ConstantPoolCacheEntry::tos_state_shift, + ConstantPoolCacheEntry::tos_state_bits); + + switch (bytecode()) { + case Bytecodes::_ldc: + case Bytecodes::_ldc_w: + { + // tos in (itos, ftos, stos, btos, ctos, ztos) + Label notInt, notFloat, notShort, notByte, notChar, notBool; + __ cmp(flags, itos); + __ b(notInt, Assembler::NE); + // itos + __ ldr(r0, field); + __ push(itos); + __ b(Done); + + __ bind(notInt); + __ cmp(flags, ftos); + __ b(notFloat, Assembler::NE); + // ftos + __ lea(rarg, field); // vldr does not accept [r+r] address format + __ load_float(Address(rarg)); + __ push(ftos); + __ b(Done); + + __ bind(notFloat); + __ cmp(flags, stos); + __ b(notShort, Assembler::NE); + // stos + __ load_signed_short(r0, field); + __ push(stos); + __ b(Done); + + __ bind(notShort); + __ cmp(flags, btos); + __ b(notByte, Assembler::NE); + // btos + __ load_signed_byte(r0, field); + __ push(btos); + __ b(Done); + + __ bind(notByte); + __ cmp(flags, ctos); + __ b(notChar, Assembler::NE); + // ctos + __ load_unsigned_short(r0, field); + __ push(ctos); + __ b(Done); + + __ bind(notChar); + __ cmp(flags, ztos); + __ b(notBool, Assembler::NE); + // ztos + __ load_signed_byte(r0, field); + __ push(ztos); + __ b(Done); + + __ bind(notBool); + break; + } + + case Bytecodes::_ldc2_w: + { + Label notLong, notDouble; + __ cmp(flags, ltos); + __ b(notLong, Assembler::NE); + // ltos + __ ldrd(r0, r1, field); + __ push(ltos); + __ b(Done); + + __ bind(notLong); + __ cmp(flags, dtos); + __ b(notDouble, Assembler::NE); + // dtos + __ lea(rarg, field); // vdlr does not accept [r+r] address format + __ load_double(Address(rarg)); + __ push(dtos); + __ b(Done); + + __ bind(notDouble); + break; + } + + default: + ShouldNotReachHere(); + } + + __ stop("bad ldc/condy"); +} + +void TemplateTable::locals_index(Register reg, int offset) +{ + __ ldrb(reg, at_bcp(offset)); + __ neg(reg, reg); +} + +void TemplateTable::iload() { + iload_internal(); +} + +void TemplateTable::nofast_iload() { + iload_internal(may_not_rewrite); +} + +void TemplateTable::iload_internal(RewriteControl rc) { + transition(vtos, itos); + if (RewriteFrequentPairs && rc == may_rewrite) { + Label rewrite, done; + Register bc = r2; + + // get next bytecode + __ load_unsigned_byte(r1, at_bcp(Bytecodes::length_for(Bytecodes::_iload))); + + // if _iload, wait to rewrite to iload2. We only want to rewrite the + // last two iloads in a pair. Comparing against fast_iload means that + // the next bytecode is neither an iload or a caload, and therefore + // an iload pair. 
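+    // (the rewrites below fuse the pair into a single _fast_iload2 / _fast_icaload template)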
+ __ cmp(r1, Bytecodes::_iload); + __ b(done, Assembler::EQ); + + // if _fast_iload rewrite to _fast_iload2 + __ cmp(r1, Bytecodes::_fast_iload); + __ mov(bc, Bytecodes::_fast_iload2); + __ b(rewrite, Assembler::EQ); + + // if _caload rewrite to _fast_icaload + __ cmp(r1, Bytecodes::_caload); + __ mov(bc, Bytecodes::_fast_icaload); + __ b(rewrite, Assembler::EQ); + + // else rewrite to _fast_iload + __ mov(bc, Bytecodes::_fast_iload); + + // rewrite + // bc: new bytecode + __ bind(rewrite); + patch_bytecode(Bytecodes::_iload, bc, r1, false); + __ bind(done); + + } + + // do iload, get the local value into tos + locals_index(r1); + __ ldr(r0, iaddress(r1)); + __ reg_printf("iloaded value %d\n", r0); +} + +void TemplateTable::fast_iload2() +{ + transition(vtos, itos); + locals_index(r1); + __ ldr(r0, iaddress(r1)); + __ push(itos); + locals_index(r1, 3); + __ ldr(r0, iaddress(r1)); +} + +void TemplateTable::fast_iload() +{ + transition(vtos, itos); + locals_index(r1); + __ ldr(r0, iaddress(r1)); +} + +void TemplateTable::lload() +{ + transition(vtos, ltos); + locals_index(r2); + __ ldrd(r0, r1, laddress(r2, r3, _masm)); +} + +void TemplateTable::fload() +{ + transition(vtos, ftos); + locals_index(r1); + __ load_float(faddress(r1, r2, _masm)); +} + +void TemplateTable::dload() +{ + transition(vtos, dtos); + if (hasFPU()) { + __ ldrb(r1, at_bcp(1)); + __ sub(r1, rlocals, r1, lsl(LogBytesPerWord)); + __ load_double(Address(r1, Interpreter::local_offset_in_bytes(1))); + } else { + locals_index(r2); + __ load_double(daddress(r2, r3, _masm)); + } +} + +void TemplateTable::aload() +{ + transition(vtos, atos); + locals_index(r1); + __ ldr(r0, iaddress(r1)); +} + +void TemplateTable::locals_index_wide(Register reg) { + __ ldrh(reg, at_bcp(2)); + __ rev16(reg, reg); + __ neg(reg, reg); +} + +void TemplateTable::wide_iload() { + transition(vtos, itos); + locals_index_wide(r1); + __ ldr(r0, iaddress(r1)); +} + +void TemplateTable::wide_lload() +{ + transition(vtos, ltos); + locals_index_wide(r2); + __ ldrd(r0, r1, laddress(r2, r3, _masm)); +} + +void TemplateTable::wide_fload() +{ + transition(vtos, ftos); + locals_index_wide(r1); + if (hasFPU()) { + __ vldr_f32(d0, faddress(r1, rscratch1, _masm)); + } else { + __ ldr (r0, faddress(r1, rscratch1, _masm)); + } +} + +void TemplateTable::wide_dload() +{ + transition(vtos, dtos); + if (hasFPU()) { + __ ldrh(r1, at_bcp(2)); + __ rev16(r1, r1); + __ sub(r1, rlocals, r1, lsl(LogBytesPerWord)); + __ vldr_f64(d0, Address(r1, Interpreter::local_offset_in_bytes(1))); + } else { + locals_index_wide(r2); + __ ldrd(r0, r1, daddress(r2, r3, _masm)); + } +} + +void TemplateTable::wide_aload() +{ + transition(vtos, atos); + locals_index_wide(r1); + __ ldr(r0, aaddress(r1)); +} + +void TemplateTable::index_check(Register array, Register index) +{ + // destroys rscratch1 + // check array + __ null_check(array, arrayOopDesc::length_offset_in_bytes()); + // sign extend index for use by indexed load + // __ movl2ptr(index, index); + // check index + Register length = rscratch1; + __ ldr(length, Address(array, arrayOopDesc::length_offset_in_bytes())); + __ reg_printf("Checking index in array, array = %p, alen = %d, index = %d\n", array, length, index); + __ cmp(index, length); + if (index != r2) { + // ??? convention: move aberrant index into r2 for exception message + assert(r2 != array, "different registers"); + __ mov(r2, index); + } + Label ok; + __ b(ok, Assembler::LO); + // ??? 
convention: move array into r3 for exception message + __ mov(r3, array); + __ mov(rscratch1, Interpreter::_throw_ArrayIndexOutOfBoundsException_entry); + __ b(rscratch1); + __ bind(ok); +} + +void TemplateTable::iaload() +{ + transition(itos, itos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(2))); + __ access_load_tos_at(T_INT, IN_HEAP | IS_ARRAY, Address(r2, arrayOopDesc::base_offset_in_bytes(T_INT)), noreg, noreg); +} + +void TemplateTable::laload() +{ + transition(itos, ltos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(3))); + __ lea(r2, Address(r2, arrayOopDesc::base_offset_in_bytes(T_LONG))); + __ atomic_ldrd(r0, r1, r2); + __ access_load_tos_at(T_LONG, IN_HEAP | IS_ARRAY, Address(r2), noreg, noreg); +} + +void TemplateTable::faload() +{ + transition(itos, ftos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(2))); + __ access_load_tos_at(T_FLOAT, IN_HEAP | IS_ARRAY, + Address(r2, arrayOopDesc::base_offset_in_bytes(T_FLOAT)), noreg, noreg); +} + +void TemplateTable::daload() +{ + transition(itos, dtos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(3))); + __ access_load_tos_at(T_DOUBLE, IN_HEAP | IS_ARRAY, + Address(r2, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)), noreg, noreg); +} + +void TemplateTable::aaload() +{ + transition(itos, atos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(2))); + do_oop_load(_masm, + Address(r2, arrayOopDesc::base_offset_in_bytes(T_OBJECT)), + r0, + IS_ARRAY); +} + +void TemplateTable::baload() +{ + transition(itos, itos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(0))); + __ access_load_tos_at(T_BYTE, IN_HEAP | IS_ARRAY, Address(r2, arrayOopDesc::base_offset_in_bytes(T_BYTE)), noreg, noreg); +} + +void TemplateTable::caload() +{ + transition(itos, itos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(1))); + __ access_load_tos_at(T_CHAR, IN_HEAP | IS_ARRAY, Address(r2, arrayOopDesc::base_offset_in_bytes(T_CHAR)), noreg, noreg); +} + +// iload followed by caload frequent pair +void TemplateTable::fast_icaload() +{ + transition(vtos, itos); + // load index out of locals + locals_index(r2); + __ ldr(r2, iaddress(r2)); + + __ pop_ptr(r0); + + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r1, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(1))); + __ access_load_tos_at(T_CHAR, IN_HEAP | IS_ARRAY, Address(r2, arrayOopDesc::base_offset_in_bytes(T_CHAR)), noreg, noreg); +} + +void TemplateTable::saload() +{ + transition(itos, itos); + __ mov(r2, r0); + __ pop_ptr(r0); + // r0: array + // r2: index + index_check(r0, r2); // leaves index in r2, kills rscratch1 + __ lea(r2, Address(r0, r2, lsl(1))); + __ access_load_tos_at(T_SHORT, IN_HEAP | IS_ARRAY, Address(r2, arrayOopDesc::base_offset_in_bytes(T_SHORT)), noreg, noreg); +} + +void 
TemplateTable::iload(int n) +{ + transition(vtos, itos); + __ ldr(r0, iaddress(n)); +} + +void TemplateTable::lload(int n) +{ + transition(vtos, ltos); + __ ldrd(r0, r1, laddress(n)); +} + +void TemplateTable::fload(int n) +{ + transition(vtos, ftos); + if (hasFPU()) { + __ vldr_f32(d0, faddress(n)); + } else { + __ ldr(r0, faddress(n)); + } +} + +void TemplateTable::dload(int n) +{ + transition(vtos, dtos); + if (hasFPU()) { + __ vldr_f64(d0, daddress(n)); + } else { + __ ldrd(r0, r1, daddress(n)); + } +} + +void TemplateTable::aload(int n) +{ + transition(vtos, atos); + __ ldr(r0, iaddress(n)); + __ reg_printf("aload, loaded %p\n", r0); +} + +void TemplateTable::aload_0() { + aload_0_internal(); +} + +void TemplateTable::nofast_aload_0() { + aload_0_internal(may_not_rewrite); +} + +void TemplateTable::aload_0_internal(RewriteControl rc) { + // According to bytecode histograms, the pairs: + // + // _aload_0, _fast_igetfield + // _aload_0, _fast_agetfield + // _aload_0, _fast_fgetfield + // + // occur frequently. If RewriteFrequentPairs is set, the (slow) + // _aload_0 bytecode checks if the next bytecode is either + // _fast_igetfield, _fast_agetfield or _fast_fgetfield and then + // rewrites the current bytecode into a pair bytecode; otherwise it + // rewrites the current bytecode into _fast_aload_0 that doesn't do + // the pair check anymore. + // + // Note: If the next bytecode is _getfield, the rewrite must be + // delayed, otherwise we may miss an opportunity for a pair. + // + // Also rewrite frequent pairs + // aload_0, aload_1 + // aload_0, iload_1 + // These bytecodes with a small amount of code are most profitable + // to rewrite + if (RewriteFrequentPairs && rc == may_rewrite) { + Label rewrite, done; + const Register bc = r14; + + // get next bytecode + __ load_unsigned_byte(r1, at_bcp(Bytecodes::length_for(Bytecodes::_aload_0))); + + // if _getfield then wait with rewrite + __ cmp(r1, Bytecodes::Bytecodes::_getfield); + __ b(done, Assembler::EQ); + + // if _igetfield then rewrite to _fast_iaccess_0 + assert(Bytecodes::java_code(Bytecodes::_fast_iaccess_0) == Bytecodes::_aload_0, "fix bytecode definition"); + __ cmp(r1, Bytecodes::_fast_igetfield); + __ mov(bc, Bytecodes::_fast_iaccess_0); + __ b(rewrite, Assembler::EQ); + + // if _agetfield then rewrite to _fast_aaccess_0 + assert(Bytecodes::java_code(Bytecodes::_fast_aaccess_0) == Bytecodes::_aload_0, "fix bytecode definition"); + __ cmp(r1, Bytecodes::_fast_agetfield); + __ mov(bc, Bytecodes::_fast_aaccess_0); + __ b(rewrite, Assembler::EQ); + + // if _fgetfield then rewrite to _fast_faccess_0 + assert(Bytecodes::java_code(Bytecodes::_fast_faccess_0) == Bytecodes::_aload_0, "fix bytecode definition"); + __ cmp(r1, Bytecodes::_fast_fgetfield); + __ mov(bc, Bytecodes::_fast_faccess_0); + __ b(rewrite, Assembler::EQ); + + // else rewrite to _fast_aload0 + assert(Bytecodes::java_code(Bytecodes::_fast_aload_0) == Bytecodes::_aload_0, "fix bytecode definition"); + __ mov(bc, Bytecodes::Bytecodes::_fast_aload_0); + + // rewrite + // bc: new bytecode + __ bind(rewrite); + patch_bytecode(Bytecodes::_aload_0, bc, r1, false); + + __ bind(done); + } + + // Do actual aload_0 (must do this after patch_bytecode which might call VM and GC might change oop). 
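+  // aload(0) just reloads local 0 into r0 with an atos transition.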
+ aload(0); +} + +void TemplateTable::istore() +{ + transition(itos, vtos); + locals_index(r1); + __ lea(rscratch1, iaddress(r1)); + __ str(r0, Address(rscratch1)); +} + +void TemplateTable::lstore() +{ + transition(ltos, vtos); + locals_index(r2); + __ strd(r0, r1, laddress(r2, r3, _masm)); +} + +void TemplateTable::fstore() { + transition(ftos, vtos); + locals_index(r1); + __ lea(rscratch1, iaddress(r1)); + if (hasFPU()) { + __ vstr_f32(d0, Address(rscratch1)); + } else { + __ str(r0, Address(rscratch1)); + } +} + +void TemplateTable::dstore() { + transition(dtos, vtos); + if (hasFPU()) { + locals_index(r1); + __ vstr_f64(d0, daddress(r1, rscratch1, _masm)); + } else { + locals_index(r2); + __ strd(r0, r1, daddress(r2, rscratch1, _masm)); + } +} + +void TemplateTable::astore() +{ + transition(vtos, vtos); + __ pop_ptr(r0); + __ reg_printf("Astore, storing value %p\n", r0); + locals_index(r1); + __ str(r0, aaddress(r1)); +} + +void TemplateTable::wide_istore() { + transition(vtos, vtos); + __ pop_i(); + locals_index_wide(r1); + __ lea(rscratch1, iaddress(r1)); + __ str(r0, Address(rscratch1)); +} + +void TemplateTable::wide_lstore() { + transition(vtos, vtos); + __ pop_l(); + locals_index_wide(r2); + __ strd(r0, r1, laddress(r2, r3, _masm)); +} + +void TemplateTable::wide_fstore() { + transition(vtos, vtos); + locals_index_wide(r1); + __ lea(rscratch1, faddress(r1, rscratch1, _masm)); + if (hasFPU()) { + __ pop_f(); + __ vstr_f32(d0, rscratch1); + } else { + __ pop_i(); + __ str(r0, Address(rscratch1)); + } +} + +void TemplateTable::wide_dstore() { + transition(vtos, vtos); + if (hasFPU()) { + __ pop_d(); + locals_index_wide(r1); + __ vstr_f64(d0, daddress(r1, rscratch1, _masm)); + } else { + __ pop_l(); + locals_index_wide(r2); + __ strd(r0, r1, daddress(r2, rscratch1, _masm)); + } +} + +void TemplateTable::wide_astore() { + transition(vtos, vtos); + __ pop_ptr(r0); + locals_index_wide(r1); + __ str(r0, aaddress(r1)); +} + +void TemplateTable::iastore() { + transition(itos, vtos); + __ pop_i(r2); + __ pop_ptr(r3); + // r0: value + // r2: index + // r3: array + index_check(r3, r2); // prefer index in r2 + __ lea(rscratch1, Address(r3, r2, lsl(2))); + __ access_store_tos_at(T_INT, IN_HEAP | IS_ARRAY, + Address(rscratch1, arrayOopDesc::base_offset_in_bytes(T_INT)), noreg, noreg); +} + +void TemplateTable::lastore() { + transition(ltos, vtos); + __ pop_i(r2); + __ pop_ptr(r3); + // : value + // r2: index + // r3: array + index_check(r3, r2); // prefer index in r2 + __ lea(rscratch1, Address(r3, r2, lsl(3))); + __ lea(rscratch1, Address(rscratch1, + arrayOopDesc::base_offset_in_bytes(T_LONG))); + __ access_store_tos_at(T_LONG, IN_HEAP | IS_ARRAY, Address(rscratch1), noreg, noreg); +} + +void TemplateTable::fastore() { + transition(ftos, vtos); + __ pop_i(r2); + __ pop_ptr(r3); + // d0/r0: value + // r2: index + // r3: array + index_check(r3, r2); // prefer index in r2 + __ lea(rscratch1, Address(r3, r2, lsl(2))); + __ access_store_tos_at(T_FLOAT, IN_HEAP | IS_ARRAY, + Address(rscratch1, arrayOopDesc::base_offset_in_bytes(T_FLOAT)), noreg, noreg); +} + +void TemplateTable::dastore() { + transition(dtos, vtos); + __ pop_i(r2); + __ pop_ptr(r3); + // d0/r0:r1: value + // r2: index + // r3: array + index_check(r3, r2); // prefer index in r2 + __ lea(rscratch1, Address(r3, r2, lsl(3))); + __ access_store_tos_at(T_DOUBLE, IN_HEAP | IS_ARRAY, + Address(rscratch1, arrayOopDesc::base_offset_in_bytes(T_DOUBLE)), noreg, noreg); +} + +void TemplateTable::aastore() { + Label is_null, ok_is_subtype, done; 
+ transition(vtos, vtos); + // stack: ..., array, index, value + __ ldr(r0, at_tos()); // value + __ ldr(r2, at_tos_p1()); // index + __ ldr(r3, at_tos_p2()); // array + + Address element_address(r2, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); + + index_check(r3, r2); + + // do array store check - check for NULL value first + __ cmp(r0, 0); + __ b(is_null, Assembler::EQ); + + // Move subklass into r1 + __ load_klass(r1, r0); + // Move superklass into r0 + __ load_klass(r0, r3); + __ ldr(r0, Address(r0, + ObjArrayKlass::element_klass_offset())); + // Compress array + index*oopSize + 12 into a single register. Frees r2. + + // Generate subtype check. Blows r2, r14? + // Superklass in r0. Subklass in r1. + __ gen_subtype_check(r1, ok_is_subtype); + + // Come here on failure + // object is at TOS + __ b(Interpreter::_throw_ArrayStoreException_entry); + + // Come here on success + __ bind(ok_is_subtype); + + // Get the value we will store + __ ldr(r0, at_tos()); + // And the clobbered index + __ ldr(r2, at_tos_p1()); // index + __ lea(r2, Address(r3, r2, lsl(2))); + // Now store using the appropriate barrier + + do_oop_store(_masm, element_address, r0, IS_ARRAY); + __ b(done); + + // Have a NULL in r0, r3=array, r2=index. Store NULL at ary[idx] + __ bind(is_null); + __ profile_null_seen(r1); + + __ lea(r2, Address(r3, r2, lsl(2))); + // Store a NULL + do_oop_store(_masm, element_address, noreg, IS_ARRAY); + + // Pop stack arguments + __ bind(done); + __ add(sp, sp, 3 * Interpreter::stackElementSize); +} + +void TemplateTable::bastore() +{ + transition(itos, vtos); + __ pop_i(r2); + __ pop_ptr(r3); + // r0: value + // r2: index + // r3: array + index_check(r3, r2); // prefer index in r2 + + // Need to check whether array is boolean or byte + // since both types share the bastore bytecode. 
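+  // layout_helper_boolean_diffbit() is the single bit in which the boolean-array and
+  // byte-array layout helpers differ; if it is set in this klass's layout helper the
+  // array is T_BOOLEAN and the value is masked down to 0/1 before being stored.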
+ __ load_klass(r1, r3); + __ ldr(r1, Address(r1, Klass::layout_helper_offset())); + int diffbit = Klass::layout_helper_boolean_diffbit(); + __ tst(r1, diffbit); + __ andr(r0, r0, 1, Assembler::NE); // if it is a T_BOOLEAN array, mask the stored value to 0/1 + + __ lea(rscratch1, Address(r3, r2)); + __ access_store_tos_at(T_BYTE, IN_HEAP | IS_ARRAY, + Address(rscratch1, arrayOopDesc::base_offset_in_bytes(T_BYTE)), noreg, noreg); +} + +void TemplateTable::castore() +{ + transition(itos, vtos); + __ pop_i(r2); + __ pop_ptr(r3); + // r0: value + // r2: index + // r3: array + index_check(r3, r2); // prefer index in r2 + __ lea(rscratch1, Address(r3, r2, lsl(1))); + __ access_store_tos_at(T_CHAR, IN_HEAP | IS_ARRAY, + Address(rscratch1, arrayOopDesc::base_offset_in_bytes(T_CHAR)), noreg, noreg); +} + +void TemplateTable::sastore() +{ + castore(); +} + +void TemplateTable::istore(int n) +{ + transition(itos, vtos); + __ str(r0, iaddress(n)); +} + +void TemplateTable::lstore(int n) +{ + transition(ltos, vtos); + __ strd(r0, r1, laddress(n)); +} + +void TemplateTable::fstore(int n) +{ + transition(ftos, vtos); + if (hasFPU()) { + __ vstr_f32(d0, faddress(n)); + } else { + __ str(r0, faddress(n)); + } +} + +void TemplateTable::dstore(int n) +{ + transition(dtos, vtos); + if (hasFPU()) { + __ vstr_f64(d0, daddress(n)); + } else { + __ strd(r0, r1, daddress(n)); + } +} + +void TemplateTable::astore(int n) +{ + transition(vtos, vtos); + __ pop_ptr(r0); + __ str(r0, iaddress(n)); +} + +void TemplateTable::pop() +{ + transition(vtos, vtos); + __ add(sp, sp, Interpreter::stackElementSize); +} + +void TemplateTable::pop2() +{ + transition(vtos, vtos); + __ add(sp, sp, 2 * Interpreter::stackElementSize); +} + +void TemplateTable::dup() +{ + transition(vtos, vtos); + __ ldr(r0, Address(sp, 0)); + __ reg_printf("Value duplicated is %p\n", r0); + __ push(r0); + // stack: ..., a, a +} + +void TemplateTable::dup_x1() +{ + transition(vtos, vtos); + // stack: ..., a, b + __ ldr(r0, at_tos()); // load b + __ ldr(r2, at_tos_p1()); // load a + __ str(r0, at_tos_p1()); // store b + __ str(r2, at_tos()); // store a + __ push(r0); // push b + // stack: ..., b, a, b +} + +void TemplateTable::dup_x2() +{ + transition(vtos, vtos); + // stack: ..., a, b, c + __ ldr(r0, at_tos()); // load c + __ ldr(r2, at_tos_p2()); // load a + __ str(r0, at_tos_p2()); // store c in a + __ push(r0); // push c + // stack: ..., c, b, c, c + __ ldr(r0, at_tos_p2()); // load b + __ str(r2, at_tos_p2()); // store a in b + // stack: ..., c, a, c, c + __ str(r0, at_tos_p1()); // store b in c + // stack: ..., c, a, b, c +} + +void TemplateTable::dup2() +{ + transition(vtos, vtos); + // stack: ..., a, b + __ ldr(r0, at_tos_p1()); // load a + __ push(r0); // push a + __ ldr(r0, at_tos_p1()); // load b + __ push(r0); // push b + // stack: ..., a, b, a, b +} + +void TemplateTable::dup2_x1() +{ + transition(vtos, vtos); + // stack: ..., a, b, c + __ ldr(r2, at_tos()); // load c + __ ldr(r0, at_tos_p1()); // load b + __ push(r0); // push b + __ push(r2); // push c + // stack: ..., a, b, c, b, c + __ str(r2, at_tos_p3()); // store c in b + // stack: ..., a, c, c, b, c + __ ldr(r2, at_tos_p4()); // load a + __ str(r2, at_tos_p2()); // store a in 2nd c + // stack: ..., a, c, a, b, c + __ str(r0, at_tos_p4()); // store b in a + // stack: ..., b, c, a, b, c +} + +void TemplateTable::dup2_x2() +{ + transition(vtos, vtos); + // stack: ..., a, b, c, d + __ ldr(r2, at_tos()); // load d + __ ldr(r0, at_tos_p1()); // load c + __ push(r0) ; // push c + __ push(r2); 
// push d + // stack: ..., a, b, c, d, c, d + __ ldr(r0, at_tos_p4()); // load b + __ str(r0, at_tos_p2()); // store b in d + __ str(r2, at_tos_p4()); // store d in b + // stack: ..., a, d, c, b, c, d + __ ldr(r2, at_tos_p5()); // load a + __ ldr(r0, at_tos_p3()); // load c + __ str(r2, at_tos_p3()); // store a in c + __ str(r0, at_tos_p5()); // store c in a + // stack: ..., c, d, a, b, c, d +} + +void TemplateTable::swap() +{ + transition(vtos, vtos); + // stack: ..., a, b + __ ldr(r2, at_tos_p1()); // load a + __ ldr(r0, at_tos()); // load b + __ str(r2, at_tos()); // store a in b + __ str(r0, at_tos_p1()); // store b in a + // stack: ..., b, a +} + +void TemplateTable::iop2(Operation op) +{ + transition(itos, itos); + // r0 <== r1 op r0 + __ pop_i(r1); + switch (op) { + case add : __ add(r0, r1, r0); break; + case sub : __ sub(r0, r1, r0); break; + case mul : __ mul(r0, r1, r0); break; + case _and : __ andr(r0, r1, r0); break; + case _or : __ orr(r0, r1, r0); break; + case _xor : __ eor(r0, r1, r0); break; + case shl : + __ andr(r0, r0, 0x1f); + __ lsl(r0, r1, r0); + break; + case shr : + __ andr(r0, r0, 0x1f); + __ asr(r0, r1, r0); + break; + case ushr : + __ andr(r0, r0, 0x1f); + __ lsr(r0, r1, r0); + break; + default : ShouldNotReachHere(); + } +} + +void TemplateTable::lop2(Operation op) +{ + transition(ltos, ltos); + // <== op + __ pop_l(r2, r3); + switch (op) { + case add : __ adds(r0, r2, r0); __ adc(r1, r3, r1); break; + case sub : __ subs(r0, r2, r0); __ sbc(r1, r3, r1); break; + case mul : __ mult_long(r0, r2, r0); break; + case _and : __ andr(r0, r2, r0); __ andr(r1, r3, r1); break; + case _or : __ orr(r0, r2, r0); __ orr(r1, r3, r1); break; + case _xor : __ eor(r0, r2, r0); __ eor(r1, r3, r1); break; + default : ShouldNotReachHere(); + } +} + +void TemplateTable::idiv() +{ + transition(itos, itos); + // explicitly check for div0 + Label no_div0; + __ cmp(r0, 0); + __ b(no_div0, Assembler::NE); + __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry); + __ b(rscratch1); + __ bind(no_div0); + __ pop_i(r1); + // r0 <== r1 idiv r0 + __ divide(r0, r1, r0, 32, false); +} + +void TemplateTable::irem() +{ + transition(itos, itos); + // explicitly check for div0 + Label no_div0; + __ cmp(r0, 0); + __ b(no_div0, Assembler::NE); + __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry); + __ b(rscratch1); + __ bind(no_div0); + __ pop_i(r1); + // r0 <== r1 irem r0 + __ divide(r0, r1, r0, 32, true); +} + +void TemplateTable::lmul() +{ + transition(ltos, ltos); + __ pop_l(r2, r3); + __ mult_long(r0, r0, r2); +} + +void TemplateTable::ldiv() +{ + transition(ltos, ltos); + // explicitly check for div0 + __ cmp(r0, 0); + __ cmp(r1, 0, Assembler::EQ); + __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry, Assembler::EQ); + __ b(rscratch1, Assembler::EQ); + + __ pop_l(r2, r3); + // r0 <== r1 ldiv r0 + __ divide(r0, r2, r0, 64, false); +} + +void TemplateTable::lrem() +{ + transition(ltos, ltos); + // explicitly check for div0 + __ cmp(r0, 0); + __ cmp(r1, 0, Assembler::EQ); + __ mov(rscratch1, Interpreter::_throw_ArithmeticException_entry, Assembler::EQ); + __ b(rscratch1, Assembler::EQ); + + __ pop_l(r2, r3); + // r0 <== r1 lrem r0 + __ divide(r0, r2, r0, 64, true); +} + +void TemplateTable::lshl() { + transition(itos, ltos); + // shift count is in r0 - take shift from bottom six bits only + __ andr(r0, r0, 0x3f); + __ pop_l(r2, r3); + const int word_bits = 8 * wordSize; + + __ sub(r1, r0, word_bits); + __ lsl(r3, r3, r0); + __ orr(r3, r3, r2, lsl(r1)); + __ 
rsb(r1, r0, word_bits); + __ orr(r1, r3, r2, lsr(r1)); + __ lsl(r0, r2, r0); +} + +void TemplateTable::lshr() { + transition(itos, ltos); + // shift count is in r0 - take shift from bottom six bits only + __ andr(rscratch1, r0, 0x3f); + __ pop_l(r2, r3); + const int word_bits = 8 * wordSize; + + __ lsr(r2, r2, rscratch1); + __ rsb(r1, rscratch1, word_bits); + __ orr(r0, r2, r3, lsl(r1)); + __ asr(r1, r3, rscratch1); + __ subs(rscratch1, rscratch1, word_bits); + __ orr(r0, r2, r3, asr(rscratch1), Assembler::GT); +} + +void TemplateTable::lushr() { + transition(itos, ltos); + // shift count is in r0 - take shift from bottom six bits only + __ andr(r0, r0, 0x3f); + __ pop_l(r2, r3); + const int word_bits = 8 * wordSize; + + __ lsr(r2, r2, r0); + __ rsb(r1, r0, word_bits); + __ orr(r2, r2, r3, lsl(r1)); + __ lsr(r1, r3, r0); + __ sub(r0, r0, word_bits); + __ orr(r0, r2, r3, lsr(r0)); +} + +void TemplateTable::fop2(Operation op) +{ + transition(ftos, ftos); + if(hasFPU()) { + switch (op) { + case add: + __ pop_f(d1); + __ vadd_f32(d0, d1, d0); + break; + case sub: + __ pop_f(d1); + __ vsub_f32(d0, d1, d0); + break; + case mul: + __ pop_f(d1); + __ vmul_f32(d0, d1, d0); + break; + case div: + __ pop_f(d1); + __ vdiv_f32(d0, d1, d0); + break; + case rem: + __ vmov_f32(f1, f0); + __ pop_f(f0); + #ifndef HARD_FLOAT_CC + __ vmov_f32(r0, f0); + __ vmov_f32(r1, f1); + #endif + __ mov(rscratch1, (address)fmodf); + __ bl(rscratch1); + #ifndef HARD_FLOAT_CC + __ vmov_f32(f0, r0); + #endif + break; + default: + ShouldNotReachHere(); + break; + } + } else { +#ifdef __SOFTFP__ + __ mov(r1, r0); + __ pop_i(r0); + switch (op) { + case add: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fadd), 0); + break; + case sub: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fsub), 0); + break; + case mul: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fmul), 0); + break; + case div: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fdiv), 0); + break; + case rem: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::frem), 0); + break; + default: + ShouldNotReachHere(); + break; + } + #else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif +} +} + +void TemplateTable::dop2(Operation op) +{ + transition(dtos, dtos); + if (hasFPU()) { + switch (op) { + case add: + __ pop_d(d1); + __ vadd_f64(d0, d1, d0); + break; + case sub: + __ pop_d(d1); + __ vsub_f64(d0, d1, d0); + break; + case mul: + __ pop_d(d1); + __ vmul_f64(d0, d1, d0); + break; + case div: + __ pop_d(d1); + __ vdiv_f64(d0, d1, d0); + break; + case rem: + __ vmov_f64(d1, d0); + __ pop_d(d0); + #ifndef HARD_FLOAT_CC + __ vmov_f64(r0, r1, d0); + __ vmov_f64(r2, r3, d1); + #endif + __ mov(rscratch1, (address)(double (*)(double, double))fmod); + __ bl(rscratch1); + #ifndef HARD_FLOAT_CC + __ vmov_f64(d0, r0, r1); + #endif + break; + default: + ShouldNotReachHere(); + break; + } + } else { +#ifdef __SOFTFP__ + __ push_l(r0, r1); + __ pop_l(r2,r3); + __ pop_l(r0,r1); + switch (op) { + case add: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dadd), 0); + break; + case sub: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dsub), 0); + break; + case mul: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dmul), 0); + break; + case div: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::ddiv), 0); + break; + case rem: + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::drem), 0); + break; + default: + 
ShouldNotReachHere(); + break; + } +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } +} + +void TemplateTable::ineg() +{ + transition(itos, itos); + __ neg(r0, r0); + +} + +void TemplateTable::lneg() +{ + transition(ltos, ltos); + __ rsbs(r0, r0, 0); + __ rsc(r1, r1, 0); +} + +void TemplateTable::fneg() +{ + transition(ftos, ftos); + if(hasFPU()) { + __ vneg_f32(d0, d0); + } else { +#ifdef __SOFTFP__ + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fneg), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } +} + +void TemplateTable::dneg() +{ + transition(dtos, dtos); + if(hasFPU()) { + __ vneg_f64(d0, d0); + } else { +#ifdef __SOFTFP__ + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dneg), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } +} + +void TemplateTable::iinc() +{ + transition(vtos, vtos); + __ load_signed_byte(r1, at_bcp(2)); // get constant + locals_index(r2); + __ ldr(r0, iaddress(r2)); + __ add(r0, r0, r1); + __ str(r0, iaddress(r2)); +} + +void TemplateTable::wide_iinc() +{ + transition(vtos, vtos); + __ ldr(r1, at_bcp(2)); // get constant and index + __ rev16(r1, r1); + __ uxth(r2, r1); + __ neg(r2, r2); + __ sxth(r1, r1, ror(16)); + __ ldr(r0, iaddress(r2)); + __ add(r0, r0, r1); + __ str(r0, iaddress(r2)); +} + +void TemplateTable::convert() +{ + // Checking +#ifdef ASSERT + { + TosState tos_in = ilgl; + TosState tos_out = ilgl; + switch (bytecode()) { + case Bytecodes::_i2l: // fall through + case Bytecodes::_i2f: // fall through + case Bytecodes::_i2d: // fall through + case Bytecodes::_i2b: // fall through + case Bytecodes::_i2c: // fall through + case Bytecodes::_i2s: tos_in = itos; break; + case Bytecodes::_l2i: // fall through + case Bytecodes::_l2f: // fall through + case Bytecodes::_l2d: tos_in = ltos; break; + case Bytecodes::_f2i: // fall through + case Bytecodes::_f2l: // fall through + case Bytecodes::_f2d: tos_in = ftos; break; + case Bytecodes::_d2i: // fall through + case Bytecodes::_d2l: // fall through + case Bytecodes::_d2f: tos_in = dtos; break; + default : ShouldNotReachHere(); + } + switch (bytecode()) { + case Bytecodes::_l2i: // fall through + case Bytecodes::_f2i: // fall through + case Bytecodes::_d2i: // fall through + case Bytecodes::_i2b: // fall through + case Bytecodes::_i2c: // fall through + case Bytecodes::_i2s: tos_out = itos; break; + case Bytecodes::_i2l: // fall through + case Bytecodes::_f2l: // fall through + case Bytecodes::_d2l: tos_out = ltos; break; + case Bytecodes::_i2f: // fall through + case Bytecodes::_l2f: // fall through + case Bytecodes::_d2f: tos_out = ftos; break; + case Bytecodes::_i2d: // fall through + case Bytecodes::_l2d: // fall through + case Bytecodes::_f2d: tos_out = dtos; break; + default : ShouldNotReachHere(); + } + transition(tos_in, tos_out); + } +#endif // ASSERT + // static const int64_t is_nan = 0x8000000000000000L; + //TODO fix this and remove _ sxtw and _ uxtw as don't exist in arm32 + // need to figure out about handling doubles and longs as they won't + // fit into a single register in arm32 + // Conversion + switch (bytecode()) { + case Bytecodes::_i2l: + // __ sxtw(r0, r0); + __ reg_printf("Convert i2l (before) 0x00000000%08x\n", r0); + __ asr(r1, r0, 31); + __ reg_printf("Convert i2l (after) 0x%08x%08x\n", r1, r0); + break; + case Bytecodes::_i2f: + if(hasFPU()) { + __ vmov_f32(d0, r0); + __ vcvt_f32_s32(d0, d0); + } else { +#ifdef __SOFTFP__ + __ 
call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::i2f), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } + break; + case Bytecodes::_i2d: + if(hasFPU()) { + //__ scvtfwd(d0, r0); + __ vmov_f32(d0, r0); + __ vcvt_f64_s32(d0, d0); + } else { +#ifdef __SOFTFP__ + // ro -> + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::i2d), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } + break; + case Bytecodes::_i2b: + __ sxtb(r0, r0); + break; + case Bytecodes::_i2c: + __ uxth(r0, r0); + break; + case Bytecodes::_i2s: + __ sxth(r0, r0); + break; + case Bytecodes::_l2i: + //__ uxtw(r0, r0); + break; + case Bytecodes::_l2f: + // -> d0 + // or -> r0 for softfp + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::l2f), 0); +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f32(d0, r0); + } +#endif + break; + case Bytecodes::_l2d: + // -> d0 + // or -> for softfp + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::l2d), 0); +#ifndef HARD_FLOAT_CC + if(hasFPU()) { + __ vmov_f64(d0, r0, r1); + } +#endif + break; + case Bytecodes::_f2i: + { + if(hasFPU()) { + __ vcvt_s32_f32(d0, d0); + __ vmov_f32(r0, d0); + } else { + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::f2i), 0); + } + } + break; + case Bytecodes::_f2l: + { +#if !defined(HARD_FLOAT_CC) + //float already in d0 long goes to + if(hasFPU()) { + //Need to move float in d0 to r0 + __ vmov_f32(r0, d0); + } +#endif //!defined(HARD_FLOAT_CC) + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::f2l), 0); + } + break; + case Bytecodes::_f2d: + if(hasFPU()) { + __ vcvt_f64_f32(d0, d0); + } else { +#ifdef __SOFTFP__ + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::f2d), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } + break; + case Bytecodes::_d2i: + { + if(hasFPU()) { + __ vcvt_s32_f64(d0, d0); + __ vmov_f32(r0, d0); + } else { +#ifdef __SOFTFP__ + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::d2i), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } + } + break; + case Bytecodes::_d2l: + { + // d0 -> +#if !defined(HARD_FLOAT_CC) + if(hasFPU()) { + //Need to move float in d0 to r0 + __ vmov_f64(r0, r1, d0); + } +#endif //!defined(HARD_FLOAT_CC) + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::d2l), 0); + } + break; + case Bytecodes::_d2f: + if(hasFPU()) { + __ vcvt_f32_f64(d0, d0); + } else { +#ifdef __SOFTFP__ + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::d2f), 0); +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } + break; + default: + ShouldNotReachHere(); + } +} + +void TemplateTable::lcmp() +{ + transition(ltos, itos); + __ pop_l(r2, r3); + // == : 0 + // < : 1 + // > : -1 + __ reg_printf("Long comparing 0x%08x%08x\n", r1, r0); + __ reg_printf(" and 0x%08x%08x\n", r3, r2); + //cmp high + Label lower, end; + __ cmp(r3, r1); + __ b(lower, Assembler::EQ); + __ mov(r0, 1); + __ sub(r0, r0, 2, Assembler::LT); + __ b(end); + + __ bind(lower); + __ subs(r0, r2, r0); + __ mov(r0, 1, Assembler::NE); + __ sub(r0, r0, 2, Assembler::LO); // Place -1 + __ bind(end); + + __ reg_printf("Result of comparison is %d\n", r0); +} + +void TemplateTable::float_cmp(bool is_float, int unordered_result) +{ + if(hasFPU()) { + if (is_float) { + __ pop_f(d1); + __ vcmp_f32(d1, d0); + } else { + __ pop_d(d1); + /*__ vmov_f64(r0, r1, d0); + __ vmov_f64(r2, r3, d1); + __ reg_printf("Doing comparison cmp( 
0x%08x%08x,\n", r3, r2); + __ reg_printf(" 0x%08x%08x)\n", r1, r0);*/ + __ vcmp_f64(d1, d0); + } + __ vmrs(rscratch1); + __ andr(rscratch1, rscratch1, Assembler::FP_MASK); + __ reg_printf("Masked comparison result is %08x\n", rscratch1); + + if (unordered_result < 0) { + // we want -1 for unordered or less than, 0 for equal and 1 for + // greater than. + __ mov(r0, -1); + __ cmp(rscratch1, Assembler::FP_EQ); + __ mov(r0, 0, Assembler::EQ); + __ cmp(rscratch1, Assembler::FP_GT); + __ mov(r0, 1, Assembler::EQ); + __ reg_printf("un_res < 0, comparison result is %d\n", r0); + } else { + // we want -1 for less than, 0 for equal and 1 for unordered or + // greater than. + __ mov(r0, 1); + __ cmp(rscratch1, Assembler::FP_LT); + __ sub(r0, r0, 2, Assembler::EQ); //Load -1 - but one less instruction + __ cmp(rscratch1, Assembler::FP_EQ); + __ mov(r0, 0, Assembler::EQ); + __ reg_printf("un_res >= 0, comparison result is %d\n", r0); + } + } else { // hasFPU +#ifdef __SOFTFP__ + if (is_float) { + __ mov(r1, r0); + __ pop_i(r0); + if (unordered_result < 0) { + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fcmpl), 0); + } else { + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::fcmpg), 0); + } + } else { + __ mov(r2, r0); + __ mov(r3, r1); + __ pop_l(r0); + if (unordered_result < 0) { + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcmpl), 0); + } else { + __ call_VM_leaf_base(CAST_FROM_FN_PTR(address, SharedRuntime::dcmpg), 0); + } + } +#else + // expected -mfloat-abi=soft + ShouldNotReachHere(); +#endif + } +} + +void TemplateTable::branch(bool is_jsr, bool is_wide) +{ + // We might be moving to a safepoint. The thread which calls + // Interpreter::notice_safepoints() will effectively flush its cache + // when it makes a system call, but we need to do something to + // ensure that we see the changed dispatch table. + __ membar(MacroAssembler::LoadLoad); + + __ profile_taken_branch(r0, r1); + const ByteSize be_offset = MethodCounters::backedge_counter_offset() + + InvocationCounter::counter_offset(); + const ByteSize inv_offset = MethodCounters::invocation_counter_offset() + + InvocationCounter::counter_offset(); + + // load branch displacement + if (!is_wide) { + __ ldrh(r2, at_bcp(1)); + __ rev16(r2, r2); + // sign extend the 16 bit value in r2 + __ sxth(r2, r2); + } else { + __ ldr(r2, at_bcp(1)); + __ rev(r2, r2); + } + + // Handle all the JSR stuff here, then exit. + // It's much shorter and cleaner than intermingling with the non-JSR + // normal-branch stuff occurring below. + + if (is_jsr) { + // Pre-load the next target bytecode into rscratch1 + __ load_unsigned_byte(rscratch1, Address(rbcp, r2)); + // compute return address as bci + __ ldr(rscratch2, Address(rmethod, Method::const_offset())); + __ add(rscratch2, rscratch2, + in_bytes(ConstMethod::codes_offset()) - (is_wide ? 
5 : 3)); + __ sub(r1, rbcp, rscratch2); + __ push_i(r1); + // Adjust the bcp by the 16-bit displacement in r2 + __ add(rbcp, rbcp, r2); + __ dispatch_only(vtos, /*generate_poll*/true); + return; + } + + // Normal (non-jsr) branch handling + + // Adjust the bcp by the displacement in r2 + __ add(rbcp, rbcp, r2); + + assert(UseLoopCounter || !UseOnStackReplacement, + "on-stack-replacement requires loop counters"); + Label backedge_counter_overflow; + Label profile_method; + Label dispatch; + if (UseLoopCounter) { + // increment backedge counter for backward branches + // r0: MDO + // w1: MDO bumped taken-count + // r2: target offset + __ cmp(r2, 0); + __ b(dispatch, Assembler::GT); // count only if backward branch + + // ECN: FIXME: This code smells + // check if MethodCounters exists + Label has_counters; + __ ldr(rscratch1, Address(rmethod, Method::method_counters_offset())); + __ cbnz(rscratch1, has_counters); + __ push(r0); + __ push(r1); + __ push(r2); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::build_method_counters), rmethod); + __ pop(r2); + __ pop(r1); + __ pop(r0); + __ ldr(rscratch1, Address(rmethod, Method::method_counters_offset())); + __ cbz(rscratch1, dispatch); // No MethodCounters allocated, OutOfMemory + __ bind(has_counters); + + if (TieredCompilation) { + Label no_mdo; + int increment = InvocationCounter::count_increment; + if (ProfileInterpreter) { + // Are we profiling? + __ ldr(r1, Address(rmethod, in_bytes(Method::method_data_offset()))); + __ cbz(r1, no_mdo); + // Increment the MDO backedge counter + const Address mdo_backedge_counter(r1, in_bytes(MethodData::backedge_counter_offset()) + + in_bytes(InvocationCounter::counter_offset())); + const Address mask(r1, in_bytes(MethodData::backedge_mask_offset())); + __ increment_mask_and_jump(mdo_backedge_counter, increment, mask, + r0, rscratch2, false, Assembler::EQ, + UseOnStackReplacement ? &backedge_counter_overflow : NULL); + __ b(dispatch); + } + __ bind(no_mdo); + // Increment backedge counter in MethodCounters* + __ ldr(rscratch1, Address(rmethod, Method::method_counters_offset())); + const Address mask(rscratch1, in_bytes(MethodCounters::backedge_mask_offset())); + __ increment_mask_and_jump(Address(rscratch1, be_offset), increment, mask, + r0, rscratch2, false, Assembler::EQ, + UseOnStackReplacement ? 
&backedge_counter_overflow : NULL); + } else { // not TieredCompilation + // increment counter + __ ldr(rscratch2, Address(rmethod, Method::method_counters_offset())); + __ ldr(r0, Address(rscratch2, be_offset)); // load backedge counter + __ add(rscratch1, r0, InvocationCounter::count_increment); // increment counter + __ str(rscratch1, Address(rscratch2, be_offset)); // store counter + + __ ldr(r0, Address(rscratch2, inv_offset)); // load invocation counter + __ mov(rscratch1, (unsigned)InvocationCounter::count_mask_value); + __ andr(r0, r0, rscratch1); // and the status bits + __ ldr(rscratch1, Address(rscratch2, be_offset)); // load backedge counter + __ add(r0, r0, rscratch1); // add both counters + + if (ProfileInterpreter) { + // Test to see if we should create a method data oop + __ ldr(rscratch1, Address(rscratch2, in_bytes(MethodCounters::interpreter_profile_limit_offset()))); + __ cmp(r0, rscratch1); + __ b(dispatch, Assembler::LT); + + // if no method data exists, go to profile method + __ test_method_data_pointer(r0, profile_method); + + if (UseOnStackReplacement) { + // check for overflow against w1 which is the MDO taken count + __ ldr(rscratch1, Address(rscratch2, in_bytes(MethodCounters::interpreter_backward_branch_limit_offset()))); + __ cmp(r1, rscratch1); + __ b(dispatch, Assembler::LO); // Intel == Assembler::below + + // When ProfileInterpreter is on, the backedge_count comes + // from the MethodData*, which value does not get reset on + // the call to frequency_counter_overflow(). To avoid + // excessive calls to the overflow routine while the method is + // being compiled, add a second test to make sure the overflow + // function is called only once every overflow_frequency. + const int overflow_frequency = 1024; + const int of_mask_lsb = exact_log2(overflow_frequency); + __ bfc(r1, of_mask_lsb, 32 - of_mask_lsb); + __ cmp(r1, 0); + __ b(backedge_counter_overflow, Assembler::EQ); + + } + } else { + if (UseOnStackReplacement) { + // check for overflow against w0, which is the sum of the + // counters + __ ldr(rscratch1, Address(rscratch2, in_bytes(MethodCounters::interpreter_backward_branch_limit_offset()))); + __ cmp(r0, rscratch1); + __ b(backedge_counter_overflow, Assembler::HS); // Intel == Assembler::aboveEqual + } + } + } + } + __ bind(dispatch); + + // Pre-load the next target bytecode into rscratch1 + __ load_unsigned_byte(rscratch1, Address(rbcp, 0)); + + // continue with the bytecode @ target + // rscratch1: target bytecode + // rbcp: target bcp + __ dispatch_only(vtos, /*generate_poll*/true); + + if (UseLoopCounter) { + if (ProfileInterpreter) { + // Out-of-line code to allocate method data oop. 
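+      // (profile_method calls into the runtime to allocate the MethodData*, then the
+      // method data pointer is re-derived for the current bcp before re-entering the
+      // dispatch loop above.)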
+ __ bind(profile_method); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::profile_method)); + __ load_unsigned_byte(r1, Address(rbcp, 0)); // restore target bytecode + __ set_method_data_pointer_for_bcp(); + __ b(dispatch); + } + + if (UseOnStackReplacement) { + // invocation counter overflow + __ bind(backedge_counter_overflow); + __ neg(r2, r2); + __ add(r2, r2, rbcp); // branch bcp + // IcoResult frequency_counter_overflow([JavaThread*], address branch_bcp) + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::frequency_counter_overflow), + r2); + + __ load_unsigned_byte(r1, Address(rbcp, 0)); // restore target bytecode + + // r0: osr nmethod (osr ok) or NULL (osr not possible) + // r1: target bytecode + // r2: scratch + __ cbz(r0, dispatch); // test result -- no osr if null + // nmethod may have been invalidated (VM may block upon call_VM return) + __ ldr(r2, Address(r0, nmethod::state_offset())); + __ subs(r2, r2, nmethod::in_use); + __ b(dispatch, Assembler::NE); + + // We have the address of an on stack replacement routine in r0 + // We need to prepare to execute the OSR method. First we must + // migrate the locals and monitors off of the stack. + + __ mov(r4, r0); // save the nmethod + + call_VM(noreg, CAST_FROM_FN_PTR(address, SharedRuntime::OSR_migration_begin)); + + // r0 is OSR buffer, ensure it's in the expected parameter location + assert(j_rarg0 == r0, "assumed"); + + // remove activation + // get sender sp + __ ldr(rscratch1, + Address(rfp, frame::get_interpreter_frame_sender_sp_offset() * wordSize)); + // remove frame anchor + __ leave(); + __ mov(sp, rscratch1); + // Ensure compiled code always sees stack at proper alignment + __ align_stack(); + + // and begin the OSR nmethod + __ ldr(rscratch1, Address(r4, nmethod::osr_entry_point_offset())); + __ b(rscratch1); + } + } +} + + +void TemplateTable::if_0cmp(Condition cc) +{ + transition(itos, vtos); + // assume branch is more often taken than not (loops use backward branches) + Label not_taken; + /*if (cc == equal) { + __ cmp(r0, 0); + __ b(not_taken, Assembler::NE); + } else if (cc == not_equal) { + __ cmp(r0, 0); + __ b(not_taken, Assembler::EQ); + } else { + __ ands(rscratch1, r0, r0); + __ b(not_taken, j_not(cc)); + }*/ + __ cmp(r0, 0); + __ b(not_taken, j_not(cc)); + + branch(false, false); + __ bind(not_taken); + __ profile_not_taken_branch(r0); +} + +void TemplateTable::if_icmp(Condition cc) +{ + transition(itos, vtos); + // assume branch is more often taken than not (loops use backward branches) + Label not_taken; + __ pop_i(r1); + __ reg_printf("Comparing TOS = %p, and SOS = %p\n", r0, r1); + __ cmp(r1, r0); + __ b(not_taken, j_not(cc)); + branch(false, false); + __ bind(not_taken); + __ profile_not_taken_branch(r0); +} + +void TemplateTable::if_nullcmp(Condition cc) +{ + transition(atos, vtos); + // assume branch is more often taken than not (loops use backward branches) + Label not_taken; + if (cc == equal) + __ cbnz(r0, not_taken); + else + __ cbz(r0, not_taken); + branch(false, false); + __ bind(not_taken); + __ profile_not_taken_branch(r0); +} + +void TemplateTable::if_acmp(Condition cc) +{ + transition(atos, vtos); + // assume branch is more often taken than not (loops use backward branches) + Label not_taken; + __ pop_ptr(r1); + __ cmpoop(r1, r0); + __ b(not_taken, j_not(cc)); + branch(false, false); + __ bind(not_taken); + __ profile_not_taken_branch(r0); +} + +void TemplateTable::ret() { + transition(vtos, vtos); + // We might be moving to a safepoint. 
The thread which calls + // Interpreter::notice_safepoints() will effectively flush its cache + // when it makes a system call, but we need to do something to + // ensure that we see the changed dispatch table. + __ membar(MacroAssembler::LoadLoad); + + locals_index(r1); + __ ldr(r1, aaddress(r1)); // get return bci, compute return bcp + __ profile_ret(r1, r2); + __ ldr(rbcp, Address(rmethod, Method::const_offset())); + __ lea(rbcp, Address(rbcp, r1)); + __ add(rbcp, rbcp, in_bytes(ConstMethod::codes_offset())); + __ dispatch_next(vtos, 0, /*generate_poll*/true); +} + +void TemplateTable::wide_ret() { + transition(vtos, vtos); + locals_index_wide(r1); + __ ldr(r1, aaddress(r1)); // get return bci, compute return bcp + __ profile_ret(r1, r2); + __ ldr(rbcp, Address(rmethod, Method::const_offset())); + __ lea(rbcp, Address(rbcp, r1)); + __ add(rbcp, rbcp, in_bytes(ConstMethod::codes_offset())); + __ dispatch_next(vtos, 0, /*generate_poll*/true); +} + + +void TemplateTable::tableswitch() { + Label default_case, continue_execution; + transition(itos, vtos); + // align rbcp + __ lea(r1, at_bcp(BytesPerInt)); + __ bic(r1, r1, BytesPerInt - 1); + // load lo & hi + __ ldr(r2, Address(r1, BytesPerInt)); + __ ldr(r3, Address(r1, 2 * BytesPerInt)); + __ rev(r2, r2); + __ rev(r3, r3); + // check against lo & hi + __ cmp(r0, r2); + __ b(default_case, Assembler::LT); + __ cmp(r0, r3); + __ b(default_case, Assembler::GT); + // lookup dispatch offset + __ sub(r0, r0, r2); + __ lea(r3, Address(r1, r0, lsl(2))); + __ ldr(r3, Address(r3, 3 * BytesPerInt)); + __ profile_switch_case(r0, r1, r2); + // continue execution + __ bind(continue_execution); + __ rev(r3, r3); + __ load_unsigned_byte(rscratch1, Address(rbcp, r3)); + __ add(rbcp, rbcp, r3); + __ dispatch_only(vtos, /*generate_poll*/true); + // handle default + __ bind(default_case); + __ profile_switch_default(r0); + __ ldr(r3, Address(r1, 0)); + __ b(continue_execution); +} + +void TemplateTable::lookupswitch() { + transition(itos, itos); + __ stop("lookupswitch bytecode should have been rewritten"); +} + +void TemplateTable::fast_linearswitch() { + transition(itos, vtos); + Label loop_entry, loop, found, continue_execution; + + __ reg_printf("Linearswitching to value %d\n", r0); + + // bswap r0 so we can avoid bswapping the table entries + __ rev(r0, r0); + // align rbcp + __ lea(r14, at_bcp(BytesPerInt)); // btw: should be able to get rid of + // this instruction (change offsets + // below) + __ bic(r14, r14, BytesPerInt - 1); + // set counter + __ ldr(r1, Address(r14, BytesPerInt)); + __ rev(r1, r1); + __ b(loop_entry); + // table search + __ bind(loop); + __ lea(rscratch1, Address(r14, r1, lsl(3))); + __ ldr(rscratch1, Address(rscratch1, 2 * BytesPerInt)); + __ cmp(r0, rscratch1); + __ b(found, Assembler::EQ); + __ bind(loop_entry); + __ subs(r1, r1, 1); + __ b(loop, Assembler::PL); + // default case + __ profile_switch_default(r0); + __ ldr(r3, Address(r14, 0)); + __ b(continue_execution); + // entry found -> get offset + __ bind(found); + __ lea(rscratch1, Address(r14, r1, lsl(3))); + __ ldr(r3, Address(rscratch1, 3 * BytesPerInt)); + __ profile_switch_case(r1, r0, r14); + // continue execution + __ bind(continue_execution); + __ rev(r3, r3); + __ add(rbcp, rbcp, r3); + __ ldrb(rscratch1, Address(rbcp, 0)); + __ dispatch_only(vtos, /*generate_poll*/true); +} + +void TemplateTable::fast_binaryswitch() { + transition(itos, vtos); + // Implementation using the following core algorithm: + // + // int binary_search(int key, LookupswitchPair* array, int 
n) { + // // Binary search according to "Methodik des Programmierens" by + // // Edsger W. Dijkstra and W.H.J. Feijen, Addison Wesley Germany 1985. + // int i = 0; + // int j = n; + // while (i+1 < j) { + // // invariant P: 0 <= i < j <= n and (a[i] <= key < a[j] or Q) + // // with Q: for all i: 0 <= i < n: key < a[i] + // // where a stands for the array and assuming that the (inexisting) + // // element a[n] is infinitely big. + // int h = (i + j) >> 1; + // // i < h < j + // if (key < array[h].fast_match()) { + // j = h; + // } else { + // i = h; + // } + // } + // // R: a[i] <= key < a[i+1] or Q + // // (i.e., if key is within array, i is the correct index) + // return i; + // } + + // Register allocation + const Register key = r0; // already set (tosca) + const Register array = r1; + const Register i = r2; + const Register j = r3; + const Register h = rscratch1; + const Register temp = rscratch2; + + // Find array start + __ lea(array, at_bcp(3 * BytesPerInt)); // btw: should be able to + // get rid of this + // instruction (change + // offsets below) + __ bic(array, array, BytesPerInt - 1); + + // Initialize i & j + __ mov(i, 0); // i = 0; + __ ldr(j, Address(array, -BytesPerInt)); // j = length(array); + + // Convert j into native byteordering + __ rev(j, j); + + // And start + Label entry; + __ b(entry); + + // binary search loop + { + Label loop; + __ bind(loop); + // int h = (i + j) >> 1; + __ add(h, i, j); // h = i + j; + __ lsr(h, h, 1); // h = (i + j) >> 1; + // if (key < array[h].fast_match()) { + // j = h; + // } else { + // i = h; + // } + // Convert array[h].match to native byte-ordering before compare + __ ldr(temp, Address(array, h, lsl(3))); + __ rev(temp, temp); + __ cmp(key, temp); + // j = h if (key < array[h].fast_match()) + __ mov(j, h, Assembler::LT); + // i = h if (key >= array[h].fast_match()) + __ mov(i, h, Assembler::GE); + // while (i+1 < j) + __ bind(entry); + __ add(h, i, 1); // i+1 + __ cmp(h, j); // i+1 < j + __ b(loop, Assembler::LT); + } + + // end of binary search, result index is i (must check again!) 
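+  // One more compare of key against array[i].match (byte-swapped first) decides between
+  // the matched pair's offset and the default offset, which is read from just below the
+  // pair array (array - 2*BytesPerInt).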
+ Label default_case; + // Convert array[i].match to native byte-ordering before compare + __ ldr(temp, Address(array, i, lsl(3))); + __ rev(temp, temp); + __ cmp(key, temp); + __ b(default_case, Assembler::NE); + + // entry found -> j = offset + __ add(j, array, i, lsl(3)); + __ ldr(j, Address(j, BytesPerInt)); + __ profile_switch_case(i, key, array); + __ rev(j, j); + __ load_unsigned_byte(rscratch1, Address(rbcp, j)); + __ lea(rbcp, Address(rbcp, j)); + __ dispatch_only(vtos, /*generate_poll*/true); + + // default case -> j = default offset + __ bind(default_case); + __ profile_switch_default(i); + __ ldr(j, Address(array, -2 * BytesPerInt)); + __ rev(j, j); + __ load_unsigned_byte(rscratch1, Address(rbcp, j)); + __ lea(rbcp, Address(rbcp, j)); + __ dispatch_only(vtos, /*genrate_poll*/true); +} + +void TemplateTable::_return(TosState state) +{ + __ reg_printf("STARTING RETURN\n"); + //__ stop("_return"); + transition(state, state); + if(ltos == state) { + __ reg_printf("Doing long return, tos value is 0x%08x%08x\n", r1, r0); + } else if ( itos == state || atos == state) { + __ reg_printf("Doing int/ref return, tos value is 0x%08x\n", r0); + } + + assert(_desc->calls_vm(), + "inconsistent calls_vm information"); // call in remove_activation + + if (_desc->bytecode() == Bytecodes::_return_register_finalizer) { + assert(state == vtos, "only valid state"); + + __ reg_printf("A\n"); + __ ldr(c_rarg1, aaddress(0)); + __ reg_printf("object is = %p\nB\n", c_rarg1); + __ load_klass(r3, c_rarg1); + __ reg_printf("C\n"); + __ ldr(r3, Address(r3, Klass::access_flags_offset())); + __ reg_printf("D\n"); + __ tst(r3, JVM_ACC_HAS_FINALIZER); + __ reg_printf("E\n"); + Label skip_register_finalizer; + __ b(skip_register_finalizer, Assembler::EQ); + __ reg_printf("About to call into the VM\n"); + __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::register_finalizer), c_rarg1); + __ reg_printf("F\n"); + __ bind(skip_register_finalizer); + } + + // Issue a StoreStore barrier after all stores but before return + // from any constructor for any class with a final field. We don't + // know if this is a finalizer, so we always do so. + if (_desc->bytecode() == Bytecodes::_return) + __ membar(MacroAssembler::StoreStore); + + // Narrow result if state is itos but result type is smaller. + // Need to narrow in the return bytecode rather than in generate_return_entry + // since compiled code callers expect the result to already be narrowed. + if (state == itos) { + __ narrow(r0); + } + + __ reg_printf("About to attmpt to remove activation with rfp = %p\n", rfp); + __ remove_activation(state); + __ reg_printf("Finshed _return, about to jump to lr = %p\n", lr); + __ b(lr); +} + +// ---------------------------------------------------------------------------- +// Volatile variables demand their effects be made known to all CPU's +// in order. Store buffers on most chips allow reads & writes to +// reorder; the JMM's ReadAfterWrite.java test fails in -Xint mode +// without some kind of memory barrier (i.e., it's not sufficient that +// the interpreter does not reorder volatile references, the hardware +// also must not reorder them). +// +// According to the new Java Memory Model (JMM): +// (1) All volatiles are serialized wrt to each other. ALSO reads & +// writes act as aquire & release, so: +// (2) A read cannot let unrelated NON-volatile memory refs that +// happen after the read float up to before the read. 
It's OK for +// non-volatile memory refs that happen before the volatile read to +// float down below it. +// (3) Similar a volatile write cannot let unrelated NON-volatile +// memory refs that happen BEFORE the write float down to after the +// write. It's OK for non-volatile memory refs that happen after the +// volatile write to float up before it. +// +// We only put in barriers around volatile refs (they are expensive), +// not _between_ memory refs (that would require us to track the +// flavor of the previous memory refs). Requirements (2) and (3) +// require some barriers before volatile stores and after volatile +// loads. These nearly cover requirement (1) but miss the +// volatile-store-volatile-load case. This final case is placed after +// volatile-stores although it could just as well go before +// volatile-loads. + +//Note none of these calls use rscratch1, well some do but are set again before return +// so index can be rscratch1 ( I think ) +void TemplateTable::resolve_cache_and_index(int byte_no, + Register Rcache, + Register index, + size_t index_size) { + // Note none of the functions called here use any rscratch + // call_VM may do but will save the argument first! + const Register temp = rscratch2; + assert_different_registers(Rcache, index, temp); + + Label resolved; + + Bytecodes::Code code = bytecode(); + switch (code) { + case Bytecodes::_nofast_getfield: code = Bytecodes::_getfield; break; + case Bytecodes::_nofast_putfield: code = Bytecodes::_putfield; break; + } + + assert(byte_no == f1_byte || byte_no == f2_byte, "byte_no out of range"); + __ get_cache_and_index_and_bytecode_at_bcp(Rcache, index, temp, byte_no, 1, index_size); + __ cmp(temp, (int) code); // have we resolved this bytecode? + __ b(resolved, Assembler::EQ); + + __ reg_printf("Not resolved, resolving, with rthread = %p, rfp = %p\n", rthread, rfp); + // resolve first time through + address entry = CAST_FROM_FN_PTR(address, InterpreterRuntime::resolve_from_cache); + __ mov(temp, (int) code); + __ call_VM(noreg, entry, temp); + __ reg_printf("Resolve complete\n"); + + // Update registers with resolved info + __ get_cache_and_index_at_bcp(Rcache, index, 1, index_size); + // n.b. 
unlike x86 Rcache is now rcpool plus the indexed offset + // so all clients ofthis method must be modified accordingly + __ bind(resolved); +} + +// The Rcache and index registers must be set before call +// n.b unlike x86 cache already includes the index offset +void TemplateTable::load_field_cp_cache_entry(Register obj, + Register cache, + Register index, + Register off, + Register flags, + bool is_static = false) { + assert_different_registers(cache, index, flags, off); + + ByteSize cp_base_offset = ConstantPoolCache::base_offset(); + // Field offset + __ ldr(off, Address(cache, in_bytes(cp_base_offset + + ConstantPoolCacheEntry::f2_offset()))); + // Flags + __ ldr(flags, Address(cache, in_bytes(cp_base_offset + + ConstantPoolCacheEntry::flags_offset()))); + + // klass overwrite register + if (is_static) { + __ ldr(obj, Address(cache, in_bytes(cp_base_offset + + ConstantPoolCacheEntry::f1_offset()))); + const int mirror_offset = in_bytes(Klass::java_mirror_offset()); + __ ldr(obj, Address(obj, mirror_offset)); + __ resolve_oop_handle(obj, r3); + } +} + +void TemplateTable::load_invoke_cp_cache_entry(int byte_no, + Register method, + Register itable_index, + Register flags, + bool is_invokevirtual, + bool is_invokevfinal, /*unused*/ + bool is_invokedynamic) { + // setup registers + const Register cache = rscratch1; + const Register index = r14; + assert_different_registers(method, flags); + assert_different_registers(method, cache, index); + assert_different_registers(itable_index, flags); + assert_different_registers(itable_index, cache, index); + // determine constant pool cache field offsets + assert(is_invokevirtual == (byte_no == f2_byte), "is_invokevirtual flag redundant"); + const int method_offset = in_bytes( + ConstantPoolCache::base_offset() + + (is_invokevirtual + ? ConstantPoolCacheEntry::f2_offset() + : ConstantPoolCacheEntry::f1_offset())); + const int flags_offset = in_bytes(ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::flags_offset()); + // access constant pool cache fields + const int index_offset = in_bytes(ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::f2_offset()); + + size_t index_size = (is_invokedynamic ? sizeof(u4) : sizeof(u2)); + resolve_cache_and_index(byte_no, cache, index, index_size); + __ ldr(method, Address(cache, method_offset)); + + if (itable_index != noreg) { + __ ldr(itable_index, Address(cache, index_offset)); + } + __ ldr(flags, Address(cache, flags_offset)); + + __ reg_printf("Invocation, index = %d\n", index); +} + + +// The registers cache and index expected to be set before call. +// Correct values of the cache and index registers are preserved. +void TemplateTable::jvmti_post_field_access(Register cache, Register index, + bool is_static, bool has_tos) { + // do the JVMTI work here to avoid disturbing the register state below + // We use c_rarg registers here because we want to use the register used in + // the call to the VM + if (JvmtiExport::can_post_field_access()) { + // Check to see if a field access watch has been set before we + // take the time to call into the VM. 
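+    // (The counter at JvmtiExport::get_field_access_count_addr() should be non-zero only
+    // while at least one field access watch is set, so the common path branches straight
+    // to L1 without calling the VM.)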
+ Label L1; + assert_different_registers(cache, index, r0); + __ lea(rscratch1, ExternalAddress((address) JvmtiExport::get_field_access_count_addr())); + __ ldr(r0, Address(rscratch1)); + __ cmp(r0, 0); + __ b(L1, Assembler::EQ); + + __ get_cache_and_index_at_bcp(c_rarg2, c_rarg3, 1); + __ lea(c_rarg2, Address(c_rarg2, in_bytes(ConstantPoolCache::base_offset()))); + + if (is_static) { + __ mov(c_rarg1, 0); // NULL object reference + } else { + __ ldr(c_rarg1, at_tos()); // get object pointer without popping it + __ verify_oop(c_rarg1); + } + // c_rarg1: object pointer or NULL + // c_rarg2: cache entry pointer + // c_rarg3: jvalue object on the stack + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::post_field_access), + c_rarg1, c_rarg2, c_rarg3); + __ get_cache_and_index_at_bcp(cache, index, 1); + __ bind(L1); + } +} + +void TemplateTable::pop_and_check_object(Register r) +{ + __ pop_ptr(r); + __ null_check(r); // for field access must check obj. + __ verify_oop(r); +} + +void TemplateTable::getfield_or_static(int byte_no, bool is_static, RewriteControl rc) { + //__ stop("getfield or static"); + //FIXME Find a better way than this! + const Register cache = r2; + const Register index = r3; + const Register obj = r14; + const Register off = rscratch2; //pop_and_check_object + const Register flags = r0; + const Register bc = r14; // uses same reg as obj, so don't mix them + const Register bytecode = r1; + + resolve_cache_and_index(byte_no, cache, index, sizeof(u2)); + jvmti_post_field_access(cache, index, is_static, false); + load_field_cp_cache_entry(obj, cache, index, off, flags, is_static); + + if (!is_static) { + // obj is on the stack + // trashes rscratch1 + pop_and_check_object(obj); + } + + const Address field(obj, off); + + Label Done, notByte, notBool, notInt, notShort, notChar, + notLong, notFloat, notObj, notDouble, + notVolatileLong, notVolatileDouble, DoneBarrier; + + // Don't rewrite getstatic, only getfield + if (is_static) rc = may_not_rewrite; + + __ extract_bits(bytecode, flags, ConstantPoolCacheEntry::tos_state_shift, ConstantPoolCacheEntry::tos_state_bits); + + assert(btos == 0, "change code, btos != 0"); + __ cbnz(bytecode, notByte); + + // btos + __ access_load_tos_at(T_BYTE, IN_HEAP, field, noreg, noreg); + __ push(btos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_bgetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notByte); + __ cmp(bytecode, ztos); + __ b(notBool, Assembler::NE); + + // ztos (same code as btos) + __ access_load_tos_at(T_BOOLEAN, IN_HEAP, field, noreg, noreg); + __ push(ztos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + // use btos rewriting, no truncating to t/f bit is needed for getfield. 
+ patch_bytecode(Bytecodes::_fast_bgetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notBool); + __ cmp(bytecode, atos); + __ b(notObj, Assembler::NE); + // atos + do_oop_load(_masm, field, r0, IN_HEAP); + __ push(atos); + __ reg_printf("Getfield or static, atos = 0x%08x\n", r0); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_agetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notObj); + __ cmp(bytecode, itos); + __ b(notInt, Assembler::NE); + // itos + __ access_load_tos_at(T_INT, IN_HEAP, field, noreg, noreg); + __ push(itos); + __ reg_printf("Getfield or static, itos = 0x%08x\n", r0); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_igetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notInt); + __ cmp(bytecode, ctos); + __ b(notChar, Assembler::NE); + // ctos + __ access_load_tos_at(T_CHAR, IN_HEAP, field, noreg, noreg); + __ push(ctos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_cgetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notChar); + __ cmp(bytecode, stos); + __ b(notShort, Assembler::NE); + // stos + __ access_load_tos_at(T_SHORT, IN_HEAP, field, noreg, noreg); + __ push(stos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_sgetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notShort); + __ cmp(bytecode, ltos); + __ b(notLong, Assembler::NE); + // ltos + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, notVolatileLong); + __ access_load_tos_at(T_LONG, IN_HEAP | MO_SEQ_CST, field, bytecode, noreg); // don't need bytecode anymore + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ push(ltos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_lgetfield, bc, bytecode); + } + __ b(DoneBarrier); + + __ bind(notVolatileLong); + __ access_load_tos_at(T_LONG, IN_HEAP, field, noreg, noreg); + __ push(ltos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_lgetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notLong); + __ cmp(bytecode, ftos); + __ b(notFloat, Assembler::NE); + // ftos + __ access_load_tos_at(T_FLOAT, IN_HEAP, field, bytecode, noreg); // don't need bytecode anymore + __ push(ftos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_fgetfield, bc, bytecode); + } + __ b(Done); + + __ bind(notFloat); +#ifdef ASSERT + __ cmp(bytecode, dtos); + __ b(notDouble, Assembler::NE); +#endif + // dtos + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, notVolatileDouble); + __ access_load_tos_at(T_DOUBLE, IN_HEAP | MO_SEQ_CST, field, bytecode, noreg); // don't need bytecode anymore + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ push(dtos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_dgetfield, bc, bytecode); + } + __ b(DoneBarrier); + + __ bind(notVolatileDouble); + __ access_load_tos_at(T_DOUBLE, IN_HEAP, field, bytecode, noreg); // don't need bytecode anymore + __ push(dtos); + // Rewrite bytecode to be faster + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_dgetfield, bc, bytecode); + } + __ b(DoneBarrier); +#ifdef ASSERT + + __ bind(notDouble); + __ stop("Bad state"); +#endif + + __ bind(Done); + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, DoneBarrier); + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ 
bind(DoneBarrier); +} + +void TemplateTable::getfield(int byte_no) { + getfield_or_static(byte_no, false); +} + +void TemplateTable::nofast_getfield(int byte_no) { + getfield_or_static(byte_no, false, may_not_rewrite); +} + +void TemplateTable::getstatic(int byte_no) { + getfield_or_static(byte_no, true); +} + +// The registers cache and index expected to be set before call. +// The function may destroy various registers, just not the cache and index registers. +void TemplateTable::jvmti_post_field_mod(Register cache, Register index, bool is_static) { + transition(vtos, vtos); + + ByteSize cp_base_offset = ConstantPoolCache::base_offset(); + + if (JvmtiExport::can_post_field_modification()) { + // Check to see if a field modification watch has been set before + // we take the time to call into the VM. + Label L1; + assert_different_registers(cache, index, r0); + __ lea(rscratch1, ExternalAddress((address)JvmtiExport::get_field_modification_count_addr())); + __ ldr(r0, Address(rscratch1)); + __ cbz(r0, L1); + + __ get_cache_and_index_at_bcp(c_rarg2, rscratch1, 1); + + if (is_static) { + // Life is simple. Null out the object pointer. + __ mov(c_rarg1, 0); + } else { + // Life is harder. The stack holds the value on top, followed by + // the object. We don't know the size of the value, though; it + // could be one or two words depending on its type. As a result, + // we must find the type to determine where the object is. + __ ldr(c_rarg3, Address(c_rarg2, + in_bytes(cp_base_offset + + ConstantPoolCacheEntry::flags_offset()))); + __ lsr(c_rarg3, c_rarg3, + ConstantPoolCacheEntry::tos_state_shift); + ConstantPoolCacheEntry::verify_tos_state_shift(); + Label nope2, done, ok; + __ ldr(c_rarg1, at_tos_p1()); // initially assume a one word jvalue + __ cmp(c_rarg3, ltos); + __ b(ok, Assembler::EQ); + __ cmp(c_rarg3, dtos); + __ b(nope2, Assembler::NE); + __ bind(ok); + __ ldr(c_rarg1, at_tos_p2()); // ltos (two word jvalue) + __ bind(nope2); + } + // cache entry pointer + __ add(c_rarg2, c_rarg2, in_bytes(cp_base_offset)); + // object (tos) + __ mov(c_rarg3, sp); + // c_rarg1: object pointer set up above (NULL if static) + // c_rarg2: cache entry pointer + // c_rarg3: jvalue object on the stack + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::post_field_modification), + c_rarg1, c_rarg2, c_rarg3); + __ get_cache_and_index_at_bcp(cache, index, 1); + __ bind(L1); + } +} + +void TemplateTable::putfield_or_static(int byte_no, bool is_static, RewriteControl rc) { + transition(vtos, vtos); + const Register cache = r2; + const Register index = rscratch1; + const Register obj = r2; + const Register off = r3; + const Register flags = r14; + const Register bc = rscratch2; + + resolve_cache_and_index(byte_no, cache, index, sizeof(u2)); + __ reg_printf("Putfield or static, index = %d\n", index); + jvmti_post_field_mod(cache, index, is_static); + load_field_cp_cache_entry(obj, cache, index, off, flags, is_static); + + Label Done; + { + Label notVolatile; + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::StoreStore); + __ bind(notVolatile); + } + __ reg_printf("Putfield or static B\n"); + + // field address + const Address field(obj, off); + + Label notByte, notBool, notInt, notShort, notChar, + notLong, notFloat, notObj, notDouble, DoneBarrier; + + __ extract_bits(rscratch1, flags, ConstantPoolCacheEntry::tos_state_shift, ConstantPoolCacheEntry::tos_state_bits); + + __ cmp(rscratch1, btos); + __ b(notByte, Assembler::NE); + + // Don't 
rewrite putstatic, only putfield + if (is_static) rc = may_not_rewrite; + // btos + { + __ pop(btos); + if (!is_static) { + pop_and_check_object(obj); + } + __ access_store_tos_at(T_BYTE, IN_HEAP, field, noreg, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_bputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notByte); + __ cmp(rscratch1, ztos); + __ b(notBool, Assembler::NE); + + // ztos + { + __ pop(ztos); + if (!is_static) pop_and_check_object(obj); + __ access_store_tos_at(T_BOOLEAN, IN_HEAP, field, noreg, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_zputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notBool); + __ cmp(rscratch1, atos); + __ b(notObj, Assembler::NE); + + // atos + { + __ pop(atos); + if (!is_static) { + pop_and_check_object(obj); + } + // Store into the field + do_oop_store(_masm, field, r0, IN_HEAP); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_aputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notObj); + __ cmp(rscratch1, itos); + __ b(notInt, Assembler::NE); + + // itos + { + __ pop(itos); + if (!is_static) pop_and_check_object(obj); + __ access_store_tos_at(T_INT, IN_HEAP, field, noreg, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_iputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notInt); + __ cmp(rscratch1, ctos); + __ b(notChar, Assembler::NE); + + // ctos + { + __ pop(ctos); + if (!is_static) { + pop_and_check_object(obj); + } + __ access_store_tos_at(T_CHAR, IN_HEAP, field, noreg, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_cputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notChar); + __ cmp(rscratch1, stos); + __ b(notShort, Assembler::NE); + + // stos + { + __ pop(stos); + if (!is_static) { + pop_and_check_object(obj); + } + __ access_store_tos_at(T_SHORT, IN_HEAP, field, noreg, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_sputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notShort); + __ cmp(rscratch1, ltos); + __ b(notLong, Assembler::NE); + + // ltos + { + Label nonVolatileLong; + __ pop(ltos); + if (!is_static) pop_and_check_object(obj); + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, nonVolatileLong); + __ lea(flags, field); + __ access_store_tos_at(T_LONG, IN_HEAP | MO_SEQ_CST, Address(flags), r2, r3); // trashes index===rscratch1 + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_lputfield, bc, r1, true, byte_no); + } + __ membar(MacroAssembler::StoreLoad); + __ b(DoneBarrier); + __ bind(nonVolatileLong); + __ access_store_tos_at(T_LONG, IN_HEAP, field, noreg, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_lputfield, bc, r1, true, byte_no); + } + __ b(DoneBarrier); + } + + __ bind(notLong); + __ cmp(rscratch1, ftos); + __ b(notFloat, Assembler::NE); + + // ftos + { + __ pop(ftos); + if (!is_static) pop_and_check_object(obj); + __ access_store_tos_at(T_FLOAT, IN_HEAP, field, index, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_fputfield, bc, r1, true, byte_no); + } + __ b(Done); + } + + __ bind(notFloat); +#ifdef ASSERT + __ cmp(rscratch1, dtos); + __ b(notDouble, Assembler::NE); +#endif // ASSERT + + // dtos + { + Label nonVolatileDouble; + __ pop(dtos); + if (!is_static) pop_and_check_object(obj); + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, nonVolatileDouble); + __ access_store_tos_at(T_DOUBLE, IN_HEAP | MO_SEQ_CST, field, r2, r3); 
// trashes index===rscratch1 + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_dputfield, bc, r1, true, byte_no); + } + __ membar(MacroAssembler::StoreLoad); + __ b(DoneBarrier); + __ bind(nonVolatileDouble); + __ access_store_tos_at(T_DOUBLE, IN_HEAP, field, index, noreg); + if (rc == may_rewrite) { + patch_bytecode(Bytecodes::_fast_dputfield, bc, r1, true, byte_no); + } + __ b(DoneBarrier); + } + +#ifdef ASSERT + __ b(Done); + + __ bind(notDouble); + __ stop("Bad state"); +#endif // ASSERT + + __ bind(Done); + + { + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, DoneBarrier); + __ membar(MacroAssembler::StoreLoad); + } + __ bind(DoneBarrier); + //FIXME find a more elegant way! + __ get_dispatch(); +} + +void TemplateTable::putfield(int byte_no) { + putfield_or_static(byte_no, false); +} + +void TemplateTable::nofast_putfield(int byte_no) { + putfield_or_static(byte_no, false, may_not_rewrite); +} + +void TemplateTable::putstatic(int byte_no) { + putfield_or_static(byte_no, true); +} + +void TemplateTable::jvmti_post_fast_field_mod() +{ + if (JvmtiExport::can_post_field_modification()) { + // Check to see if a field modification watch has been set before + // we take the time to call into the VM. + Label L2; + __ lea(rscratch1, ExternalAddress((address)JvmtiExport::get_field_modification_count_addr())); + __ ldr(c_rarg3, Address(rscratch1)); + __ cmp(c_rarg3, 0); + __ b(L2, Assembler::EQ); + __ pop_ptr(r14); // copy the object pointer from tos + __ verify_oop(r14); + __ push_ptr(r14); // put the object pointer back on tos + // Save tos values before call_VM() clobbers them. Since we have + // to do it for every data type, we use the saved values as the + // jvalue object. + switch (bytecode()) { // load values into the jvalue object + case Bytecodes::_fast_aputfield: __ push_ptr(r0); break; + case Bytecodes::_fast_bputfield: // fall through + case Bytecodes::_fast_zputfield: // fall through + case Bytecodes::_fast_sputfield: // fall through + case Bytecodes::_fast_cputfield: // fall through + case Bytecodes::_fast_iputfield: __ push_i(r0); break; + case Bytecodes::_fast_dputfield: + if(hasFPU()) { + __ push_d(); + } else { + __ push_l(); + } + break; + case Bytecodes::_fast_fputfield: + if(hasFPU()) { + __ push_f(); + } else { + __ push_i(); + } + break; + case Bytecodes::_fast_lputfield: __ push_l(r0); break; + + default: + ShouldNotReachHere(); + } + __ mov(c_rarg3, sp); // points to jvalue on the stack + // access constant pool cache entry + __ get_cache_entry_pointer_at_bcp(c_rarg2, r0, 1); + __ verify_oop(r14); + // r14: object pointer copied above + // c_rarg2: cache entry pointer + // c_rarg3: jvalue object on the stack + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::post_field_modification), + r14, c_rarg2, c_rarg3); + + switch (bytecode()) { // restore tos values + case Bytecodes::_fast_aputfield: __ pop_ptr(r0); break; + case Bytecodes::_fast_fputfield: + if(hasFPU()) { + __ pop_f(); break; + } + case Bytecodes::_fast_bputfield: // fall through + case Bytecodes::_fast_zputfield: // fall through + case Bytecodes::_fast_sputfield: // fall through + case Bytecodes::_fast_cputfield: // fall through + case Bytecodes::_fast_iputfield: __ pop_i(r0); break; + case Bytecodes::_fast_dputfield: + if(hasFPU()) { + __ pop_d(); break; + } + case Bytecodes::_fast_lputfield: __ pop_l(r0); break; + } + __ bind(L2); + } +} + +void TemplateTable::fast_storefield(TosState state) +{ + transition(state, vtos); + + ByteSize base = 
ConstantPoolCache::base_offset(); + + jvmti_post_fast_field_mod(); + + // access constant pool cache + __ get_cache_and_index_at_bcp(r2, rscratch1, 1); // index not used + + Register flags = r14; + // test for volatile with r14 + __ ldr(flags, Address(r2, in_bytes(base + + ConstantPoolCacheEntry::flags_offset()))); + + // replace index with field offset from cache entry + __ ldr(r3, Address(r2, in_bytes(base + ConstantPoolCacheEntry::f2_offset()))); + + // Get object from stack + pop_and_check_object(r2); + + // field address + const Address field(r2, r3); + + // long and double need special processing, see below + // the rest only need barrier before if field is volatile + if (bytecode() != Bytecodes::_fast_dputfield && bytecode() != Bytecodes::_fast_lputfield) { + Label notVolatile; + __ tbz(r14, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::StoreStore); + __ bind(notVolatile); + } + + // access field + switch (bytecode()) { + case Bytecodes::_fast_aputfield: + do_oop_store(_masm, field, r0, IN_HEAP); + break; + case Bytecodes::_fast_dputfield: + { + Label notVolatile, cont; + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::StoreStore); + __ access_store_tos_at(T_DOUBLE, IN_HEAP | MO_SEQ_CST, field, r2, r3); // trashes rscratch1, ok to reuse r2, r3 + __ b(cont); + __ bind(notVolatile); + __ access_store_tos_at(T_DOUBLE, IN_HEAP, field, rscratch1, noreg); + __ bind(cont); + } + break; + case Bytecodes::_fast_lputfield: + { + Label notVolatile, cont; + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::StoreStore); + __ access_store_tos_at(T_LONG, IN_HEAP | MO_SEQ_CST, field, r2, r3); // trashes rscratch1, ok to reuse r2, r3 + __ b(cont); + __ bind(notVolatile); + __ access_store_tos_at(T_LONG, IN_HEAP, field, rscratch1, noreg); + __ bind(cont); + } + break; + case Bytecodes::_fast_fputfield: + __ access_store_tos_at(T_FLOAT, IN_HEAP, field, rscratch1, noreg); + break; + case Bytecodes::_fast_iputfield: + __ access_store_tos_at(T_INT, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_zputfield: + __ access_store_tos_at(T_BOOLEAN, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_bputfield: + __ access_store_tos_at(T_BYTE, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_sputfield: + __ access_store_tos_at(T_SHORT, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_cputfield: + __ access_store_tos_at(T_CHAR, IN_HEAP, field, noreg, noreg); + break; + default: + ShouldNotReachHere(); + } + + { + Label notVolatile; + __ tbz(flags, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::StoreLoad); + __ bind(notVolatile); + } +} + + +void TemplateTable::fast_accessfield(TosState state) +{ + transition(atos, state); + // Do the JVMTI work here to avoid disturbing the register state below + if (JvmtiExport::can_post_field_access()) { + // Check to see if a field access watch has been set before we + // take the time to call into the VM. 
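+    // Cheap guard: the generated code loads the global field-access watch
+    // count and skips the runtime notification entirely when it is zero.
+    // In effect (sketch, not emitted literally):
+    //   if (*JvmtiExport::get_field_access_count_addr() != 0)
+    //     InterpreterRuntime::post_field_access(obj, cache_entry);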
+ Label L1; + __ lea(rscratch1, ExternalAddress((address) JvmtiExport::get_field_access_count_addr())); + __ ldr(r2, Address(rscratch1)); + __ cmp(r2, 0); + __ b(L1, Assembler::EQ); + // access constant pool cache entry + __ get_cache_entry_pointer_at_bcp(c_rarg2, rscratch2, 1); + __ verify_oop(r0); + __ push_ptr(r0); // save object pointer before call_VM() clobbers it + __ mov(c_rarg1, r0); + // c_rarg1: object pointer copied above + // c_rarg2: cache entry pointer + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::post_field_access), + c_rarg1, c_rarg2); + __ pop_ptr(r0); // restore object pointer + __ bind(L1); + } + + // access constant pool cache + __ get_cache_and_index_at_bcp(r2, r1, 1); + __ ldr(r1, Address(r2, in_bytes(ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::f2_offset()))); + __ ldr(r3, Address(r2, in_bytes(ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::flags_offset()))); + + // r0: object + __ verify_oop(r0); + __ null_check(r0); + const Address field(r0, r1); + + // access field + switch (bytecode()) { + case Bytecodes::_fast_agetfield: + do_oop_load(_masm, field, r0, IN_HEAP); + __ verify_oop(r0); + break; + case Bytecodes::_fast_dgetfield: + { + Label notVolatile, cont; + __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ access_load_tos_at(T_DOUBLE, IN_HEAP | MO_SEQ_CST, field, r2, r3); // trashes rscratch1 + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ b(cont); + __ bind(notVolatile); + __ access_load_tos_at(T_DOUBLE, IN_HEAP, field, rscratch1, noreg); + __ bind(cont); + } + break; + case Bytecodes::_fast_lgetfield: + { + Label notVolatile, cont; + __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ access_load_tos_at(T_LONG, IN_HEAP | MO_SEQ_CST, field, r2, r3); // trashes rscratch1 + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ b(cont); + __ bind(notVolatile); + __ access_load_tos_at(T_LONG, IN_HEAP, field, noreg, noreg); + __ bind(cont); + } + break; + case Bytecodes::_fast_fgetfield: + __ access_load_tos_at(T_FLOAT, IN_HEAP, field, rscratch1, noreg); + break; + case Bytecodes::_fast_igetfield: + __ access_load_tos_at(T_INT, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_bgetfield: + __ access_load_tos_at(T_BYTE, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_sgetfield: + __ access_load_tos_at(T_SHORT, IN_HEAP, field, noreg, noreg); + break; + case Bytecodes::_fast_cgetfield: + __ access_load_tos_at(T_CHAR, IN_HEAP, field, noreg, noreg); + break; + default: + ShouldNotReachHere(); + } + // long and double have barrier already placed + if (bytecode() != Bytecodes::_fast_dgetfield && bytecode() != Bytecodes::_fast_lgetfield) { + Label notVolatile; + __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ bind(notVolatile); + } +} + +void TemplateTable::fast_xaccess(TosState state) +{ + transition(vtos, state); + + // get receiver + __ ldr(r0, aaddress(0)); + // access constant pool cache + __ get_cache_and_index_at_bcp(r2, r3, 2); + __ ldr(r1, Address(r2, in_bytes(ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::f2_offset()))); + // make sure exception is reported in correct bcp range (getfield is + // next instruction) + __ add(rbcp, rbcp, 1); + __ null_check(r0); + + Address field(r0, r1); + switch (state) { + case ftos: + __ access_load_tos_at(T_FLOAT, IN_HEAP, field, r0, noreg); + break; + case 
itos: + __ access_load_tos_at(T_INT, IN_HEAP, field, noreg, noreg); + break; + case atos: + do_oop_load(_masm, field, r0, IN_HEAP); + __ verify_oop(r0); + break; + default: + ShouldNotReachHere(); + } + + { + Label notVolatile; + __ ldr(r3, Address(r2, in_bytes(ConstantPoolCache::base_offset() + + ConstantPoolCacheEntry::flags_offset()))); + __ tbz(r3, ConstantPoolCacheEntry::is_volatile_shift, notVolatile); + __ membar(MacroAssembler::LoadLoad | MacroAssembler::LoadStore); + __ bind(notVolatile); + } + + __ sub(rbcp, rbcp, 1); +} + + + +//----------------------------------------------------------------------------- +// Calls + +void TemplateTable::count_calls(Register method, Register temp) { + // implemented elsewhere + ShouldNotReachHere(); +} + +void TemplateTable::prepare_invoke(int byte_no, + Register method, // linked method (or i-klass) + Register index, // itable index, MethodType, etc. + Register recv, // if caller wants to see it + Register flags // if caller wants to test it + ) { + // determine flags + Bytecodes::Code code = bytecode(); + const bool is_invokeinterface = code == Bytecodes::_invokeinterface; + const bool is_invokedynamic = code == Bytecodes::_invokedynamic; + const bool is_invokehandle = code == Bytecodes::_invokehandle; + const bool is_invokevirtual = code == Bytecodes::_invokevirtual; + const bool is_invokespecial = code == Bytecodes::_invokespecial; + const bool load_receiver = (recv != noreg); + const bool save_flags = (flags != noreg); + assert(load_receiver == (code != Bytecodes::_invokestatic && code != Bytecodes::_invokedynamic), ""); + assert(save_flags == (is_invokeinterface || is_invokevirtual), "need flags for vfinal"); + assert(flags == noreg || flags == r3, ""); + assert(recv == noreg || recv == r2, ""); + + // setup registers & access constant pool cache + if (recv == noreg) recv = r2; + if (flags == noreg) flags = r3; + assert_different_registers(method, index, recv, flags); + + // save 'interpreter return address' + __ save_bcp(); + + load_invoke_cp_cache_entry(byte_no, method, index, flags, is_invokevirtual, false, is_invokedynamic); + + // maybe push appendix to arguments (just before return address) + if (is_invokedynamic || is_invokehandle) { + Label L_no_push; + __ tbz(flags, ConstantPoolCacheEntry::has_appendix_shift, L_no_push); + // Push the appendix as a trailing parameter. + // This must be done before we get the receiver, + // since the parameter_size includes it. + __ push(r14); //NOT NEEDED?! + __ mov(r14, index); + assert(ConstantPoolCacheEntry::_indy_resolved_references_appendix_offset == 0, "appendix expected at index+0"); + __ load_resolved_reference_at_index(index, r14); + __ pop(r14); + __ push(index); // push appendix (MethodType, CallSite, etc.) + __ bind(L_no_push); + } + + // load receiver if needed (note: no return address pushed yet) + if (load_receiver) { + __ andr(recv, flags, ConstantPoolCacheEntry::parameter_size_mask); + // const int no_return_pc_pushed_yet = -1; // argument slot correction before we push return address + // const int receiver_is_at_end = -1; // back off one slot to get receiver + // Address recv_addr = __ argument_address(recv, no_return_pc_pushed_yet + receiver_is_at_end); + // __ movptr(recv, recv_addr); + + __ add(rscratch1, sp, recv, lsl(2)); + __ ldr(recv, Address(rscratch1, -Interpreter::expr_offset_in_bytes(1))); + __ verify_oop(recv); + } + + // compute return type + // x86 uses a shift and mask or wings it with a shift plus assert + // the mask is not needed. 
aarch32 just uses bitfield extract + __ extract_bits(rscratch2, flags, ConstantPoolCacheEntry::tos_state_shift, ConstantPoolCacheEntry::tos_state_bits); + // load return address + { + const address table_addr = (address) Interpreter::invoke_return_entry_table_for(code); + __ mov(rscratch1, table_addr); + __ ldr(lr, Address(rscratch1, rscratch2, lsl(2))); + } +} + + +void TemplateTable::invokevirtual_helper(Register index, + Register recv, + Register flags) +{ + // Uses temporary registers r0, r3 + assert_different_registers(index, recv, r0, r3); + // Test for an invoke of a final method + Label notFinal; + __ tbz(flags, ConstantPoolCacheEntry::is_vfinal_shift, notFinal); + + __ reg_printf("It's a virtual final call\n"); + const Register method = index; // method must be rmethod + assert(method == rmethod, + "methodOop must be rmethod for interpreter calling convention"); + + // do the call - the index is actually the method to call + // that is, f2 is a vtable index if !is_vfinal, else f2 is a Method* + + // It's final, need a null check here! + __ null_check(recv); + + // profile this call + __ profile_final_call(r0); + __ profile_arguments_type(r0, method, rscratch2, true); + + __ jump_from_interpreted(method, r0); + + __ bind(notFinal); + __ reg_printf("It's not a virtual final call\n"); + // get receiver klass + __ null_check(recv, oopDesc::klass_offset_in_bytes()); + __ load_klass(r0, recv); + + // profile this call + __ profile_virtual_call(r0, rlocals, r3); + + // get target methodOop & entry point + __ lookup_virtual_method(r0, index, method); + __ profile_arguments_type(r3, method, rscratch2, true); + + __ jump_from_interpreted(method, r3); +} + +void TemplateTable::invokevirtual(int byte_no) +{ + transition(vtos, vtos); + assert(byte_no == f2_byte, "use this argument"); + + __ reg_printf("Invokevirtual, the sp is %p\n", sp); + prepare_invoke(byte_no, rmethod, noreg, r2, r3); + + // rmethod: index (actually a Method*) + // r2: receiver + // r3: flags + + invokevirtual_helper(rmethod, r2, r3); +} + +void TemplateTable::invokespecial(int byte_no) +{ + transition(vtos, vtos); + assert(byte_no == f1_byte, "use this argument"); + __ ldr(rscratch1, Address(sp)); + __ reg_printf("Stack pointer is %p, tos word = %p\n", sp, rscratch1); + + prepare_invoke(byte_no, rmethod, noreg, // get f1 Method* + r2); // get receiver also for null check + + __ verify_oop(r2); + __ null_check(r2); + + // do the call + __ profile_call(r0); + __ profile_arguments_type(r0, rmethod, rbcp, false); + __ jump_from_interpreted(rmethod, r0); +} + +void TemplateTable::invokestatic(int byte_no) +{ + transition(vtos, vtos); + assert(byte_no == f1_byte, "use this argument"); + + prepare_invoke(byte_no, rmethod); // get f1 Method* + // do the call + __ profile_call(r0); + __ profile_arguments_type(r0, rmethod, rscratch2, false); + __ jump_from_interpreted(rmethod, r0); +} + +void TemplateTable::fast_invokevfinal(int byte_no) { + transition(vtos, vtos); + assert(byte_no == f2_byte, "use this argument"); + __ stop("fast_invokevfinal not used on aarch32");} + +void TemplateTable::invokeinterface(int byte_no) { + transition(vtos, vtos); + assert(byte_no == f1_byte, "use this argument"); + + Register temp = rdispatch; //free at this point and reloaded later + prepare_invoke(byte_no, r0, rmethod, // get f1 Klass*, f2 Method* + r2, r3); // recv, flags + + + __ create_breakpoint(); + // r0: interface klass (from f1) + // rmethod: method (from f2) + // r2: receiver + // r3: flags + + // First check for Object case, then private 
interface method, + // then regular interface method. + + // Special case of invokeinterface called for virtual method of + // java.lang.Object. See cpCache.cpp for details. + Label notObjectMethod; + __ tbz(r3, ConstantPoolCacheEntry::is_forced_virtual_shift, notObjectMethod); + + __ reg_printf("ABC: Invoking invokevirtual_helper\n"); + invokevirtual_helper(rmethod, r2, r3); //loads lr too + __ bind(notObjectMethod); + + Label no_such_interface; + + // Check for private method invocation - indicated by vfinal + Label notVFinal; + __ tbz(r3, ConstantPoolCacheEntry::is_vfinal_shift, notVFinal); + + // Get receiver klass into r3 - also a null check + __ null_check(r2, oopDesc::klass_offset_in_bytes()); + __ load_klass(r3, r2); + + Label subtype; + __ check_klass_subtype(r3, r0, temp, subtype); + // If we get here the typecheck failed + __ b(no_such_interface); + __ bind(subtype); + + __ profile_final_call(r0); + __ profile_arguments_type(r0, rmethod, temp, true); + __ jump_from_interpreted(rmethod, r0); + + __ bind(notVFinal); + + __ reg_printf("ABC: invokeinterface says 'It's not a method'\n"); + // Get receiver klass into r3 - also a null check + __ restore_locals(); + __ null_check(r2, oopDesc::klass_offset_in_bytes()); + __ load_klass(r3, r2); + + Label no_such_method; + + // Preserve method in r1 for throw_AbstractMethodErrorVerbose. + __ mov(r1, rmethod); + // Receiver subtype check against REFC. + // Superklass in r0. Subklass in r3. + __ lookup_interface_method(// inputs: rec. class, interface, itable index + r3, r0, noreg, + // outputs: scan temp. reg, scan temp. reg + rbcp, temp, + no_such_interface, + /*return_method=*/false); + + + // profile this call + __ restore_bcp(); // rbcp was destroyed by receiver type check + __ profile_virtual_call(r3, temp, r0); + + // Get declaring interface class from method, and itable index + __ ldr(r0, Address(rmethod, Method::const_offset())); + __ ldr(r0, Address(r0, ConstMethod::constants_offset())); + __ ldr(r0, Address(r0, ConstantPool::pool_holder_offset_in_bytes())); + __ ldr(rmethod, Address(rmethod, Method::itable_index_offset())); + assert(Method::itable_index_max <= 0, "incorrect below"); + __ add(temp, rmethod, -Method::itable_index_max); + __ neg(rmethod, temp); + + // Preserve recvKlass for throw_AbstractMethodErrorVerbose. + __ mov(rlocals, r3); + __ lookup_interface_method(// inputs: rec. class, interface, itable index + rlocals, r0, rmethod, + // outputs: method, scan temp. reg + rmethod, temp, + no_such_interface); + + // rmethod,: methodOop to call + // r2: receiver + // Check for abstract method error + // Note: This should be done more efficiently via a throw_abstract_method_error + // interpreter entry point and a conditional jump to it in case of a null + // method. + __ cbz(rmethod, no_such_method); + + __ profile_arguments_type(r3, rmethod, temp, true); + + // do the call + // r2: receiver + // rmethod,: methodOop + __ jump_from_interpreted(rmethod, r3); + __ should_not_reach_here(); + + // exception handling code follows... + // note: must restore interpreter registers to canonical + // state for exception handling to work correctly! + + __ bind(no_such_method); + __ reg_printf("ABC: invokeinterface says 'There's no such method'\n"); + // throw exception + __ restore_bcp(); // bcp must be correct for exception handler (was destroyed) + __ restore_locals(); // make sure locals pointer is correct as well (was destroyed) + // Pass arguments for generating a verbose error message. 
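+  // We only reach no_such_method when the resolved itable slot is empty,
+  // i.e. the receiver class provides no implementation of the selected
+  // interface method; the runtime call below raises AbstractMethodError.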
+ __ call_VM(noreg, CAST_FROM_FN_PTR(address, InterpreterRuntime::throw_AbstractMethodErrorVerbose), r3, r1); + // the call_VM checks for exception, so we should never return here. + __ should_not_reach_here(); + + __ bind(no_such_interface); + __ reg_printf("ABC: invokeinterface says 'There's no such interface'\n"); + // throw exception + __ restore_bcp(); // bcp must be correct for exception handler (was destroyed) + __ restore_locals(); // make sure locals pointer is correct as well (was destroyed) + // Pass arguments for generating a verbose error message. + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_IncompatibleClassChangeErrorVerbose), r3, r0); + // the call_VM checks for exception, so we should never return here. + __ should_not_reach_here(); + return; +} + +void TemplateTable::invokehandle(int byte_no) { + transition(vtos, vtos); + assert(byte_no == f1_byte, "use this argument"); + + prepare_invoke(byte_no, rmethod, r0, r2); + __ verify_method_ptr(r2); + __ verify_oop(r2); + __ null_check(r2); + + // FIXME: profile the LambdaForm also + + __ profile_final_call(r3); + __ profile_arguments_type(r3, rmethod, rscratch2, true); + + __ jump_from_interpreted(rmethod, r0); +} + +void TemplateTable::invokedynamic(int byte_no) { + transition(vtos, vtos); + assert(byte_no == f1_byte, "use this argument"); + + prepare_invoke(byte_no, rmethod, r0); + + // r0: CallSite object (from cpool->resolved_references[]) + // rmethod: MH.linkToCallSite method (from f2) + + // Note: r0_callsite is already pushed by prepare_invoke + + // %%% should make a type profile for any invokedynamic that takes a ref argument + // profile this call + __ profile_call(rbcp); + __ profile_arguments_type(r3, rmethod, rscratch2, false); + + __ verify_oop(r0); + + __ jump_from_interpreted(rmethod, r0); +} + + +//----------------------------------------------------------------------------- +// Allocation + +void TemplateTable::_new() { + transition(vtos, atos); + + __ get_unsigned_2_byte_index_at_bcp(r3, 1); + Label slow_case; + Label done; + Label initialize_header; + Label initialize_object; // including clearing the fields + + __ get_cpool_and_tags(r2, r0); + // Make sure the class we're about to instantiate has been resolved. + // This is done before loading InstanceKlass to be consistent with the order + // how Constant Pool is updated (see ConstantPool::klass_at_put) + const int tags_offset = Array::base_offset_in_bytes(); + __ lea(rscratch1, Address(r0, r3, lsl(0))); + __ ldrb(rscratch1, Address(rscratch1, tags_offset)); + __ cmp(rscratch1, JVM_CONSTANT_Class); + __ b(slow_case, Assembler::NE); + + // get InstanceKlass + __ load_resolved_klass_at_offset(r2, r3, r2, rscratch1); + + // make sure klass is initialized & doesn't have finalizer + // make sure klass is fully initialized + __ ldrb(rscratch1, Address(r2, InstanceKlass::init_state_offset())); + __ cmp(rscratch1, InstanceKlass::fully_initialized); + __ b(slow_case, Assembler::NE); + + // get instance_size in InstanceKlass (scaled to a count of bytes) + __ ldr(r3, Address(r2, Klass::layout_helper_offset())); + // test to see if it has a finalizer or is malformed in some way + __ tbnz(r3, exact_log2(Klass::_lh_instance_slow_path_bit), slow_case); + + // Allocate the instance: + // If TLAB is enabled: + // Try to allocate in the TLAB. + // If fails, go to the slow path. + // Else If inline contiguous allocations are enabled: + // Try to allocate in eden. + // If fails due to heap end, go to slow path. 
+ // + // If TLAB is enabled OR inline contiguous is enabled: + // Initialize the allocation. + // Exit. + // + // Go to slow path. + const bool allow_shared_alloc = + Universe::heap()->supports_inline_contig_alloc(); + + if (UseTLAB) { + __ tlab_allocate(r0, r3, 0, noreg, r1, slow_case); + + if (ZeroTLAB) { + // the fields have been already cleared + __ b(initialize_header); + } else { + // initialize both the header and fields + __ b(initialize_object); + } + } else { + // Allocation in the shared Eden, if allowed. + // + // r3: instance size in bytes + if (allow_shared_alloc) { + __ eden_allocate(r0, r3, 0, r10, slow_case); + } + } + + // If UseTLAB or allow_shared_alloc are true, the object is created above and + // there is an initialize need. Otherwise, skip and go to the slow path. + if (UseTLAB || allow_shared_alloc) { + // The object is initialized before the header. If the object size is + // zero, go directly to the header initialization. + __ bind(initialize_object); + __ sub(r3, r3, sizeof(oopDesc)); + __ cbz(r3, initialize_header); + + // Initialize object fields + { + __ add(rscratch1, r0, sizeof(oopDesc)); + __ mov(rscratch2, 0); + Label loop; + __ bind(loop); + __ str(rscratch2, Address(__ post(rscratch1, BytesPerInt))); + __ sub(r3, r3, BytesPerInt); + __ cbnz(r3, loop); + } + + // initialize object header only. + __ bind(initialize_header); + if (UseBiasedLocking) { + __ ldr(rscratch1, Address(r2, Klass::prototype_header_offset())); + } else { + __ mov(rscratch1, (intptr_t)markOopDesc::prototype()); + } + __ str(rscratch1, Address(r0, oopDesc::mark_offset_in_bytes())); + __ mov(rscratch2, 0); + __ store_klass_gap(r0, rscratch2); // zero klass gap for compressed oops - not using + // not using compressed oops + __ store_klass(r0, r2); // store klass last + +#ifdef DTRACE_ENABLED + { + SkipIfEqual skip(_masm, &DTraceAllocProbes, false); + // Trigger dtrace event for fastpath + __ push(atos); // save the return value + __ call_VM_leaf( + CAST_FROM_FN_PTR(address, SharedRuntime::dtrace_object_alloc), r0); + __ pop(atos); // restore the return value + + } +#endif + __ b(done); + } + + // slow case + __ bind(slow_case); + __ get_constant_pool(c_rarg1); + __ get_unsigned_2_byte_index_at_bcp(c_rarg2, 1); + call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::_new), c_rarg1, c_rarg2); + __ verify_oop(r0); + + // continue + __ bind(done); + + __ reg_printf("New object reference is %p\n", r0); + // Must prevent reordering of stores for object initialization with stores that publish the new object. + __ membar(Assembler::StoreStore); +} + +void TemplateTable::newarray() { + transition(itos, atos); + __ load_unsigned_byte(c_rarg1, at_bcp(1)); + __ mov(c_rarg2, r0); + call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::newarray), + c_rarg1, c_rarg2); + // Must prevent reordering of stores for object initialization with stores that publish the new object. + __ membar(Assembler::StoreStore); +} + +void TemplateTable::anewarray() { + transition(itos, atos); + __ get_unsigned_2_byte_index_at_bcp(c_rarg2, 1); + __ reg_printf("Index = %d\n", c_rarg2); + __ get_constant_pool(c_rarg1); + __ mov(c_rarg3, r0); + __ reg_printf("About to call InterpreterRuntime::anewarray\n"); + call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::anewarray), + c_rarg1, c_rarg2, c_rarg3); + __ reg_printf("Finshed call to InterpreterRuntime::anewarray\n"); + // Must prevent reordering of stores for object initialization with stores that publish the new object. 
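+  // Without the barrier a racing thread could observe the published array
+  // reference before the stores that initialize its header and length.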
+ __ membar(Assembler::StoreStore); + __ reg_printf("Finshed anewarray\n"); +} + +void TemplateTable::arraylength() { + transition(atos, itos); + __ null_check(r0, arrayOopDesc::length_offset_in_bytes()); + __ ldr(r0, Address(r0, arrayOopDesc::length_offset_in_bytes())); +} + +void TemplateTable::checkcast() +{ + transition(atos, atos); + Label done, is_null, ok_is_subtype, quicked, resolved; + __ cbz(r0, is_null); + + // Get cpool & tags index + __ get_cpool_and_tags(r2, r3); // r2=cpool, r3=tags array + __ get_unsigned_2_byte_index_at_bcp(r14, 1); // r14=index + // See if bytecode has already been quicked + __ add(rscratch1, r3, Array::base_offset_in_bytes()); + __ ldrb(r1, Address(rscratch1, r14)); + __ cmp(r1, JVM_CONSTANT_Class); + __ b(quicked, Assembler::EQ); + + __ push(atos); // save receiver for result, and for GC + call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc)); + // vm_result_2 has metadata result + __ get_vm_result_2(r0, rthread); + __ pop(r3); // restore receiver + __ b(resolved); + + // Get superklass in r0 and subklass in r3 + __ bind(quicked); + __ mov(r3, r0); // Save object in r3; r0 needed for subtype check + __ load_resolved_klass_at_offset(r2, r14, r0, rscratch1); // r0 = klass + + __ bind(resolved); + __ load_klass(r1, r3); + + // Generate subtype check. Blows r2. Object in r3. + // Superklass in r0. Subklass in r1. + __ gen_subtype_check(r1, ok_is_subtype); + + // Come here on failure + __ push(r3); + // object is at TOS + __ b(Interpreter::_throw_ClassCastException_entry); + + // Come here on success + __ bind(ok_is_subtype); + __ mov(r0, r3); // Restore object in r3 + + // Collect counts on whether this test sees NULLs a lot or not. + if (ProfileInterpreter) { + __ b(done); + __ bind(is_null); + __ profile_null_seen(r2); + } else { + __ bind(is_null); // same as 'done' + } + __ bind(done); +} + +void TemplateTable::instanceof() { + transition(atos, itos); + Label done, is_null, ok_is_subtype, quicked, resolved; + __ cbz(r0, is_null); + + // Get cpool & tags index + __ get_cpool_and_tags(r2, r3); // r2=cpool, r3=tags array + __ get_unsigned_2_byte_index_at_bcp(r14, 1); // r14=index + + // See if bytecode has already been quicked + __ add(rscratch1, r3, Array::base_offset_in_bytes()); + __ ldrb(r1, Address(rscratch1, r14)); + __ cmp(r1, JVM_CONSTANT_Class); + __ b(quicked, Assembler::EQ); + + __ push(atos); // save receiver for result, and for GC + __ push_i(r14); // save index (used if profiling) + call_VM(r0, CAST_FROM_FN_PTR(address, InterpreterRuntime::quicken_io_cc)); + // vm_result_2 has metadata result + __ get_vm_result_2(r0, rthread); + __ pop_i(r14); // restore index + __ pop(r3); // restore receiver + __ verify_oop(r3); + __ load_klass(r3, r3); + __ b(resolved); + + // Get superklass in r0 and subklass in r3 + __ bind(quicked); + __ load_klass(r3, r0); + __ load_resolved_klass_at_offset(r2, r14, r0, rscratch1); + + __ bind(resolved); + + // Generate subtype check. Blows r2. + // Superklass in r0. Subklass in r3. + __ gen_subtype_check(r3, ok_is_subtype); + + // Come here on failure + __ mov(r0, 0); + __ b(done); + // Come here on success + __ bind(ok_is_subtype); + __ mov(r0, 1); + + // Collect counts on whether this test sees NULLs a lot or not. 
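+  // With profiling enabled, a null receiver takes the separate is_null path
+  // so profile_null_seen() can record it in the MDO; either way it reaches
+  // done with r0 == 0, since null is never an instance of the klass.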
+ if (ProfileInterpreter) { + __ b(done); + __ bind(is_null); + __ profile_null_seen(r2); + } else { + __ bind(is_null); // same as 'done' + } + __ bind(done); + // r0 = 0: obj == NULL or obj is not an instanceof the specified klass + // r0 = 1: obj != NULL and obj is an instanceof the specified klass +} + +//----------------------------------------------------------------------------- +// Breakpoints +void TemplateTable::_breakpoint() { + // Note: We get here even if we are single stepping.. + // jbug inists on setting breakpoints at every bytecode + // even if we are in single step mode. + + transition(vtos, vtos); + + // get the unpatched byte code + __ get_method(c_rarg1); + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, + InterpreterRuntime::get_original_bytecode_at), + c_rarg1, rbcp); + __ push(r0); + + // post the breakpoint event + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, InterpreterRuntime::_breakpoint), + rmethod, rbcp); + + // complete the execution of original bytecode + __ pop(rscratch1); + __ dispatch_only_normal(vtos); +} + +//----------------------------------------------------------------------------- +// Exceptions + +void TemplateTable::athrow() { + transition(atos, vtos); + __ null_check(r0); + __ b(Interpreter::throw_exception_entry()); +} + +//----------------------------------------------------------------------------- +// Synchronization +// +// Note: monitorenter & exit are symmetric routines; which is reflected +// in the assembly code structure as well +// +// Stack layout: +// +// [expressions ] <--- sp = expression stack top +// .. +// [expressions ] +// [monitor entry] <--- monitor block top = expression stack bot +// .. +// [monitor entry] +// [frame data ] <--- monitor block bot +// ... +// [saved rbp ] <--- rbp +void TemplateTable::monitorenter() +{ + transition(atos, vtos); + + // check for NULL object + __ null_check(r0); + + const Address monitor_block_top( + rfp, frame::get_interpreter_frame_monitor_block_top_offset() * wordSize); + const Address monitor_block_bot( + rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize); + const int entry_size = frame::interpreter_frame_monitor_size() * wordSize; + + Label allocated; + + // initialize entry pointer + __ mov(c_rarg1, 0); // points to free slot or NULL + + // find a free slot in the monitor block (result in c_rarg1) + { + Label entry, loop, exit; + __ ldr(c_rarg3, monitor_block_top); // points to current entry, + // starting with top-most entry + __ lea(c_rarg2, monitor_block_bot); // points to word before bottom + + __ b(entry); + + __ bind(loop); + // check if current entry is used + // if not used then remember entry in c_rarg1 + __ ldr(rscratch1, Address(c_rarg3, BasicObjectLock::obj_offset_in_bytes())); + __ cmp(rscratch1, 0); + __ mov(c_rarg1, c_rarg3, Assembler::EQ); + // check if current entry is for same object + __ cmp(r0, rscratch1); + // if same object then stop searching + __ b(exit, Assembler::EQ); + // otherwise advance to next entry + __ add(c_rarg3, c_rarg3, entry_size); + __ bind(entry); + // check if bottom reached + __ cmp(c_rarg3, c_rarg2); + // if not at bottom then check this entry + __ b(loop, Assembler::NE); + __ bind(exit); + } + + __ cbnz(c_rarg1, allocated); // check if a slot has been found and + // if found, continue with that on + + // allocate one if there's no free slot + { + Label entry, loop; //, no_adjust; + // 1. 
compute new pointers // rsp: old expression stack top + __ ldr(c_rarg1, monitor_block_bot); // c_rarg1: old expression stack bottom + __ sub(sp, sp, entry_size); // move expression stack top + __ sub(c_rarg1, c_rarg1, entry_size); // move expression stack bottom + __ mov(c_rarg3, sp); // set start value for copy loop + __ str(c_rarg1, monitor_block_bot); // set new monitor block bottom + + //__ cmp(sp, c_rarg3); // Check if we need to move sp + //__ b(no_adjust, Assembler::LO); // to allow more stack space + // for our new sp + //__ sub(sp, sp, 2 * wordSize); + //__ bind(no_adjust); + + __ b(entry); + // 2. move expression stack contents + __ bind(loop); + __ ldr(c_rarg2, Address(c_rarg3, entry_size)); // load expression stack + // word from old location + __ str(c_rarg2, Address(c_rarg3, 0)); // and store it at new location + __ add(c_rarg3, c_rarg3, wordSize); // advance to next word + __ bind(entry); + __ cmp(c_rarg3, c_rarg1); // check if bottom reached + __ b(loop, Assembler::NE); // if not at bottom then + // copy next word + } + + // call run-time routine + // c_rarg1: points to monitor entry + __ bind(allocated); + + // Increment bcp to point to the next bytecode, so exception + // handling for async. exceptions work correctly. + // The object has already been poped from the stack, so the + // expression stack looks correct. + __ add(rbcp, rbcp, 1); //inc + + // store object + __ str(r0, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes())); + __ lock_object(c_rarg1); + + // check to make sure this monitor doesn't cause stack overflow after locking + __ save_bcp(); // in case of exception + __ generate_stack_overflow_check(0); + + // The bcp has already been incremented. Just need to dispatch to + // next instruction. + __ dispatch_next(vtos); +} + + +void TemplateTable::monitorexit() +{ + transition(atos, vtos); + + // check for NULL object + __ null_check(r0); + + const Address monitor_block_top( + rfp, frame::get_interpreter_frame_monitor_block_top_offset() * wordSize); + const Address monitor_block_bot( + rfp, frame::get_interpreter_frame_initial_sp_offset() * wordSize); + const int entry_size = frame::interpreter_frame_monitor_size() * wordSize; + + Label found; + + // find matching slot + { + Label entry, loop; + __ ldr(c_rarg1, monitor_block_top); // points to current entry, + // starting with top-most entry + __ lea(c_rarg2, monitor_block_bot); // points to word before bottom + // of monitor block + __ b(entry); + + __ bind(loop); + // check if current entry is for same object + __ ldr(rscratch1, Address(c_rarg1, BasicObjectLock::obj_offset_in_bytes())); + __ cmp(r0, rscratch1); + // if same object then stop searching + __ b(found, Assembler::EQ); + // otherwise advance to next entry + __ add(c_rarg1, c_rarg1, entry_size); + __ bind(entry); + // check if bottom reached + __ cmp(c_rarg1, c_rarg2); + // if not at bottom then check this entry + __ b(loop, Assembler::NE); + } + + // error handling. 
Unlocking was not block-structured + __ call_VM(noreg, CAST_FROM_FN_PTR(address, + InterpreterRuntime::throw_illegal_monitor_state_exception)); + __ should_not_reach_here(); + + // call run-time routine + __ bind(found); + __ push_ptr(r0); // make sure object is on stack (contract with oopMaps) + __ unlock_object(c_rarg1); + __ pop_ptr(r0); // discard object +} + + +// Wide instructions +//J_UPDATE +void TemplateTable::wide() +{ + __ load_unsigned_byte(r14, at_bcp(1)); + __ mov(rscratch1, (address)Interpreter::_wentry_point); + __ ldr(rscratch1, Address(rscratch1, r14, lsl(2))); + __ b(rscratch1); +} + + +// Multi arrays +//J_UPDATE +void TemplateTable::multianewarray() { + transition(vtos, atos); + __ load_unsigned_byte(r0, at_bcp(3)); // get number of dimensions + // last dim is on top of stack; we want address of first one: + // first_addr = last_addr + (ndims - 1) * wordSize + __ lea(c_rarg1, Address(sp, r0, lsl(2))); + __ sub(c_rarg1, c_rarg1, wordSize); + call_VM(r0, + CAST_FROM_FN_PTR(address, InterpreterRuntime::multianewarray), + c_rarg1); + __ load_unsigned_byte(r1, at_bcp(3)); + __ lea(sp, Address(sp, r1, lsl(2))); +} --- /dev/null 2018-09-25 19:25:30.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/templateTable_aarch32.hpp 2018-09-25 19:25:30.000000000 +0300 @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_TEMPLATETABLE_AARCH32_64_HPP +#define CPU_AARCH32_VM_TEMPLATETABLE_AARCH32_64_HPP + +static void prepare_invoke(int byte_no, + Register method, // linked method (or i-klass) + Register index = noreg, // itable index, MethodType, etc. + Register recv = noreg, // if caller wants to see it + Register flags = noreg // if caller wants to test it + ); + static void invokevirtual_helper(Register index, Register recv, + Register flags); + + // Helpers + static void index_check(Register array, Register index); + static void index_check_without_pop(Register array, Register index); + +#endif // CPU_AARCH32_VM_TEMPLATETABLE_AARCH32_64_HPP --- /dev/null 2018-09-25 19:25:31.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vmStructs_aarch32.hpp 2018-09-25 19:25:31.000000000 +0300 @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2001, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_VMSTRUCTS_AARCH32_HPP +#define CPU_AARCH32_VM_VMSTRUCTS_AARCH32_HPP + +// These are the CPU-specific fields, types and integer +// constants required by the Serviceability Agent. This file is +// referenced by vmStructs.cpp. + +#define VM_STRUCTS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \ + \ + /******************************/ \ + /* JavaCallWrapper */ \ + /******************************/ \ + /******************************/ \ + /* JavaFrameAnchor */ \ + /******************************/ \ + volatile_nonstatic_field(JavaFrameAnchor, _last_Java_fp, intptr_t*) + + +#define VM_TYPES_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type) + +#define VM_INT_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) + +#define VM_LONG_CONSTANTS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) + +#endif // CPU_AARCH32_VM_VMSTRUCTS_AARCH32_HPP --- /dev/null 2018-09-25 19:25:32.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vm_version_aarch32.cpp 2018-09-25 19:25:32.000000000 +0300 @@ -0,0 +1,352 @@ +/* + * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "memory/resourceArea.hpp" +#include "runtime/java.hpp" +#include "runtime/stubCodeGenerator.hpp" +#include "utilities/macros.hpp" +#include "vm_version_aarch32.hpp" +#include "compiler/disassembler.hpp" + +#include OS_HEADER_INLINE(os) + +// Next function in another compilation unit to prevent inlining and +// breaking frame size check +extern int aarch32_get_fp_sp_distance(); + +enum ProcessorFeatures VM_Version::_features = FT_NONE; + +static BufferBlob* stub_blob; +static const int stub_size = 550; +volatile bool VM_Version::_is_determine_features_test_running = false; + +extern "C" { + typedef void (*getPsrInfo_stub_t)(void*); +} +static getPsrInfo_stub_t getPsrInfo_stub = NULL; + +typedef unsigned long (*pgetauxval)(unsigned long type); + +bool VM_Version::identify_procline(const char *tag, char **line) { + char *i = *line; + const char EOT = '\t', EOT2 = ':'; // the longest has no tabs + for (; '\0' != *i && EOT != *i && EOT2 != *i; i++); + if (EOT == *i || EOT2 == *i) { + if (!memcmp(*line, tag, i - *line)) { + for (i++; (EOT == *i || EOT2 == *i || ' ' == *i) && '\0' != *i; i++); + if ('\0' != *i) { + *line = i; + return true; + } + } + } + return false; +} + +void VM_Version::get_processor_features() { + _supports_cx8 = true; + _supports_atomic_getset4 = true; + _supports_atomic_getadd4 = true; + _supports_atomic_getset8 = true; + _supports_atomic_getadd8 = true; + + if (FLAG_IS_DEFAULT(AllocatePrefetchDistance)) + FLAG_SET_DEFAULT(AllocatePrefetchDistance, 256); + if (FLAG_IS_DEFAULT(AllocatePrefetchStepSize)) + FLAG_SET_DEFAULT(AllocatePrefetchStepSize, 64); + FLAG_SET_DEFAULT(PrefetchScanIntervalInBytes, 256); + FLAG_SET_DEFAULT(PrefetchFieldsAhead, 256); + FLAG_SET_DEFAULT(PrefetchCopyIntervalInBytes, 256); + + enum ProcessorFeatures f = FT_NONE; + + // Allocate space for the code. + const int code_size = 11 * Assembler::instruction_size; + ResourceMark rm; + CodeBuffer cb("detect_cpu_features", code_size, 0); + MacroAssembler* a = new MacroAssembler(&cb); + jlong test_area; + + // Must be set to true so we can generate the test code. + _features = FT_ALL; + // Emit code. + uint32_t *const code = (uint32_t *)a->pc(); + void (*test)(address addr, uintptr_t offset)=(void(*)(address addr, uintptr_t nonzero))(void *)code; + + a->udiv(r3, r2, r1); // FT_HW_DIVIDE + a->bfc(r1, 1, 1); // FT_ARMV6T2 + a->vneg_f64(d0, d0); // FT_VFPV2 + a->vmov_f64(d0, 1.); // FT_VFPV3 + a->dmb(Assembler::ISH); // FT_ARMV7 + a->ldrexd(r2, r0); // FT_ARMV6K + a->vmov_f64(d0, 0.0); // FT_AdvSIMD + a->crc32b(r3, r2, r1); // FT_CRC32 + a->vmov_f64(d16, 1.); // FT_VFPV3D32 + a->pldw(Address(r0)); // FT_MP_EXT + a->aese(q0, q0); // FT_AES + a->b(lr); + + uint32_t *const code_end = (uint32_t *)a->pc(); + a->flush(); + _features = FT_NONE; + + // Print the detection code. + if (PrintAssembly) { + ttyLocker ttyl; + tty->print_cr("Decoding cpu-feature detection stub at " INTPTR_FORMAT " before execution:", p2i(code)); + Disassembler::decode((u_char*)code, (u_char*)code_end, tty); + } + // Execute code. Illegal instructions will be replaced by 0 in the signal handler. 
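+  // Protocol sketch: while the flag below is set, the signal handler is
+  // expected to patch out any probe instruction that traps and resume; after
+  // the run, every slot that still holds something other than a nop (see the
+  // checks below) marks its feature as supported.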
+ VM_Version::_is_determine_features_test_running = true; + (*test)((address)&test_area, 1); + VM_Version::_is_determine_features_test_running = false; + + uint32_t *insn = code; + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_HW_DIVIDE); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_ARMV6T2); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_VFPV2); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_VFPV3); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_ARMV7); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_ARMV6K); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_AdvSIMD); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_CRC32); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_VFPV3D32); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_MP_EXT); + if (*insn++ != Assembler::nop_insn) f = (ProcessorFeatures) (f | FT_AES); + + int ncores = 0, cpu = 0, variant = 0, model = 0, revision = 0; + char buf[2048], *i; + if (FILE * fp = fopen("/proc/cpuinfo", "r")) { + while ((i = fgets(buf, 2048, fp))) { + if (identify_procline("processor", &i)) { + ncores++; + } else if (identify_procline("CPU implementer", &i)) { + cpu = strtol(i, NULL, 0); + } else if (identify_procline("CPU variant", &i)) { + variant = strtol(i, NULL, 0); + } else if (identify_procline("CPU part", &i)) { + model = strtol(i, NULL, 0); + } else if (identify_procline("CPU revision", &i)) { + revision = strtol(i, NULL, 0); + } + } + fclose(fp); + } + if (1 == ncores) { + f = (ProcessorFeatures) (f | FT_SINGLE_CORE); + } + + sprintf(buf, "0x%02x:0x%x:0x%03x:%d", cpu, variant, model, revision); + if (f & FT_VFPV2) strcat(buf, ", vfp"); + if (f & FT_VFPV3) strcat(buf, ", vfpv3"); + if (f & FT_VFPV3D32) strcat(buf, ", vfpd32"); + if (f & FT_AdvSIMD) strcat(buf, ", simd, neon"); + if (f & FT_CRC32) strcat(buf, ", crc"); + if (f & FT_AES) strcat(buf, ", aes"); + + _features_string = os::strdup(buf); + + if (FLAG_IS_DEFAULT(UseCRC32)) { + UseCRC32 = (f & FT_CRC32) != 0; + } + if (UseCRC32 && (f & FT_CRC32) == 0) { + warning("UseCRC32 specified, but not supported on this CPU"); + FLAG_SET_DEFAULT(UseCRC32, false); + } + if (FLAG_IS_DEFAULT(UseCRC32Intrinsics)) { + FLAG_SET_DEFAULT(UseCRC32Intrinsics, true); + } + if (FLAG_IS_DEFAULT(UseCRC32CIntrinsics)) { + FLAG_SET_DEFAULT(UseCRC32CIntrinsics, true); + } + if ((f & FT_AdvSIMD) && FLAG_IS_DEFAULT(UseNeon) && (model & ~0x0f0) >= 0xc08) { + UseNeon = true; + } + _features = f; + +#ifdef COMPILER2 + if (FLAG_IS_DEFAULT(UseMultiplyToLenIntrinsic)) { + FLAG_SET_DEFAULT(UseMultiplyToLenIntrinsic, true); + } +#endif // COMPILER2 + + if (f & FT_AdvSIMD) { // don't use UseNeon since Montgomery intrinsics are benefitial even on Cortex-A7 + if (FLAG_IS_DEFAULT(UseMontgomeryMultiplyIntrinsic)) { + FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, true); + } + if (FLAG_IS_DEFAULT(UseMontgomerySquareIntrinsic)) { + FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, true); + } + } else { + if (UseMontgomeryMultiplyIntrinsic || UseMontgomerySquareIntrinsic) { + warning("Montgomery intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseMontgomeryMultiplyIntrinsic, false); + FLAG_SET_DEFAULT(UseMontgomerySquareIntrinsic, false); + } + } + + if (FLAG_IS_DEFAULT(UseSIMDForMemoryOps) && (f & (FT_VFPV2 | FT_AdvSIMD))) { + FLAG_SET_DEFAULT(UseSIMDForMemoryOps, true); + } + +/* if 
(FLAG_IS_DEFAULT(UseBarriersForVolatile)) { + UseBarriersForVolatile = (_cpuFeatures & CPU_DMB_ATOMICS) != 0; + }*/ + + /*if(!(f & FT_ARMV7) && FLAG_IS_DEFAULT(UseMembar)) { + UseMembar = false; + } else if(UseMembar) { + fprintf(stderr, "Unable to use memory barriers as not on ARMv7, disabling.\n"); + UseMembar = false; + }*/ + if (UseAES) { + if ((f & FT_AES) == 0) + warning("UseAES specified, but not supported on this CPU"); + else + warning("UseAES specified, but not supported"); + FLAG_SET_DEFAULT(UseAES, false); + } + if (FLAG_IS_DEFAULT(UseAESIntrinsics)) { + UseAESIntrinsics = true; + } + + if (UseSHA) { + warning("SHA instructions are not available on this CPU"); + FLAG_SET_DEFAULT(UseSHA, false); + } + if (f & FT_AdvSIMD) { + if(FLAG_IS_DEFAULT(UseSHA1Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA1Intrinsics, true); + } + if(FLAG_IS_DEFAULT(UseSHA256Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA256Intrinsics, true); + } + if(FLAG_IS_DEFAULT(UseSHA512Intrinsics)) { + FLAG_SET_DEFAULT(UseSHA512Intrinsics, true); + } + } else if (UseSHA1Intrinsics || UseSHA256Intrinsics || UseSHA512Intrinsics) { + warning("SHA intrinsics are not available on this CPU"); + FLAG_SET_DEFAULT(UseSHA1Intrinsics, false); + FLAG_SET_DEFAULT(UseSHA256Intrinsics, false); + FLAG_SET_DEFAULT(UseSHA512Intrinsics, false); + } + +} + +static bool get_is_thumb() { + intptr_t x, y; + asm ("mov %0, pc\n" + "mov %1, pc": "=r"(x), "=r"(y)); + return y - x == 2; +} + +void VM_Version::initialize() { + ResourceMark rm; + + stub_blob = BufferBlob::create("getPsrInfo_stub", stub_size); + if (stub_blob == NULL) { + vm_exit_during_initialization("Unable to allocate getPsrInfo_stub"); + } + + get_processor_features(); + + const bool thumb = get_is_thumb(); + + if (FLAG_IS_DEFAULT(VMFrameAPCS)) { + if (thumb) { + FLAG_SET_DEFAULT(VMFrameAPCS, false); + } else { + const int fp_sp_dist = aarch32_get_fp_sp_distance(); + // mov r12, sp + // push {r11, r12, lr, pc} + // sub r11, r12, #4 + const int apcs_dist = 12; + + assert((0 <= fp_sp_dist) && (fp_sp_dist % 4 == 0), "fp/sp sanity check"); + assert(fp_sp_dist <= 16, "Assume leaf function should not save many registers in prolog"); + + FLAG_SET_DEFAULT(VMFrameAPCS, fp_sp_dist == apcs_dist); + } + } + + if (FLAG_IS_DEFAULT(JNIFrameAPCS)) { + FLAG_SET_DEFAULT(JNIFrameAPCS, VMFrameAPCS); + } + + // This machine does not allow a lot of forms of unaligned memory accesses + if (FLAG_IS_DEFAULT(UseUnalignedAccesses)) { + FLAG_SET_DEFAULT(UseUnalignedAccesses, false); + } + + if (FrameAPCS && !FLAG_IS_DEFAULT(PreserveFramePointer) && !PreserveFramePointer) { + warning("FrameAPCS enabled, so fp will always hold frame pointer, ignoring disabled PreserveFramePointer!"); + } + + if (thumb && (VMFrameAPCS || JNIFrameAPCS)) { + warning("VM and JNI APCS support is not available when VM is built in Thumb mode"); + } + + FLAG_SET_DEFAULT(CriticalJNINatives, false); +#ifndef HARD_FLOAT_CC + if( !(VM_Version::features() & (FT_VFPV2 | FT_VFPV3)) ) { +#ifdef COMPILER2 + // C2 is only supported on v7+ VFP at this time + vm_exit_during_initialization("Server VM is only supported on ARMv7+ VFP"); +#else + if(FLAG_IS_CMDLINE(UseFPU)) { + warning("FPU is not present on this core"); + } + FLAG_SET_DEFAULT(UseFPU, false); +#endif + } +#endif + +#ifdef COMPILER2 + if ( !(VM_Version::features() & FT_ARMV7) ) { + // C2 is only supported on v7+ VFP at this time + vm_exit_during_initialization("Server VM is only supported on ARMv7+"); + } + + FLAG_SET_DEFAULT(UseFPUForSpilling, true); + + if (FLAG_IS_DEFAULT(MaxVectorSize)) 
{ + // FLAG_SET_DEFAULT(MaxVectorSize, has_simd() ? 16 : 8); + // SIMD/NEON can use 16, but default is 8 because currently + // larger than 8 will disable instruction scheduling + FLAG_SET_DEFAULT(MaxVectorSize, 8); + } + + if (MaxVectorSize > 16) { + FLAG_SET_DEFAULT(MaxVectorSize, 8); + } +#endif + + UNSUPPORTED_OPTION(CriticalJNINatives); +} --- /dev/null 2018-09-25 19:25:33.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vm_version_aarch32.hpp 2018-09-25 19:25:33.000000000 +0300 @@ -0,0 +1,98 @@ +/* + * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_VM_VERSION_AARCH32_HPP +#define CPU_AARCH32_VM_VM_VERSION_AARCH32_HPP + +#include "runtime/globals_extension.hpp" +#include "runtime/vm_version.hpp" +#include "utilities/sizes.hpp" + +enum ProcessorFeatures { + FT_NONE = 0, + FT_HW_DIVIDE = 1, + FT_VFPV2 = 2, + FT_VFPV3 = 4, + FT_VFPV3D32 = 8, + FT_ARMV7 = 16, + FT_ARMV6T2 = 32, + FT_ARMV6K = 64, + FT_SINGLE_CORE = 128, + FT_AdvSIMD = 256, + FT_CRC32 = 512, + FT_MP_EXT = 1024, + FT_AES = 2048, + FT_ALL = 0xffff +}; + +class VM_Version : public Abstract_VM_Version { + public: + // Processor feature lookup. + + enum { + CPU_ARM = 'A', + CPU_BROADCOM = 'B', + CPU_CAVIUM = 'C', + CPU_DEC = 'D', + CPU_INFINEON = 'I', + CPU_MOTOROLA = 'M', + CPU_NVIDIA = 'N', + CPU_AMCC = 'P', + CPU_QUALCOM = 'Q', + CPU_MARVELL = 'V', + CPU_INTEL = 'i', + } cpuFamily; + + // Initialization + static void initialize(); + + private: + static enum ProcessorFeatures _features; + static const char* _cpu_features; + static volatile bool _is_determine_features_test_running; + + static void get_processor_features(); + static bool identify_procline(const char *tag, char **line); + + public: + static enum ProcessorFeatures features() { + return _features; + } + static void features(ProcessorFeatures f) { + _features = f; + } + static bool is_determine_features_test_running() { return _is_determine_features_test_running; } +}; + +#ifdef HARD_FLOAT_CC +inline const bool hasFPU(void) { return true; } +#else +inline bool hasFPU(void) { return (UseFPU); } +#endif + + +#endif // CPU_AARCH32_VM_VM_VERSION_AARCH32_HPP --- /dev/null 2018-09-25 19:25:34.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vm_version_aarch32_2.cpp 2018-09-25 19:25:34.000000000 +0300 @@ -0,0 +1,28 @@ +// Copyright 2013-2018 Azul Systems, Inc. 
All Rights Reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it under +// the terms of the GNU General Public License version 2 only, as published by +// the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT ANY +// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +// A PARTICULAR PURPOSE. See the GNU General Public License version 2 for more +// details (a copy is included in the LICENSE file that accompanied this code). +// +// You should have received a copy of the GNU General Public License version 2 +// along with this work; if not, write to the Free Software Foundation, Inc., +// 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + +// This file is logical part of vm_version_aarch32.cpp, but contains parts that +// _should_ be in another compilation unit + +int aarch32_get_fp_sp_distance() { + register int fp __asm__ ("r11"); + register int sp __asm__ ("r13"); + return fp - sp; +} --- /dev/null 2018-09-25 19:25:35.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vm_version_ext_aarch32.cpp 2018-09-25 19:25:35.000000000 +0300 @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#include "memory/allocation.hpp" +#include "memory/allocation.inline.hpp" +#include "runtime/os.inline.hpp" +#include "vm_version_ext_aarch32.hpp" + +// VM_Version_Ext statics +int VM_Version_Ext::_no_of_threads = 0; +int VM_Version_Ext::_no_of_cores = 0; +int VM_Version_Ext::_no_of_sockets = 0; +bool VM_Version_Ext::_initialized = false; +char VM_Version_Ext::_cpu_name[CPU_TYPE_DESC_BUF_SIZE] = {0}; +char VM_Version_Ext::_cpu_desc[CPU_DETAILED_DESC_BUF_SIZE] = {0}; + +void VM_Version_Ext::initialize_cpu_information(void) { + // do nothing if cpu info has been initialized + if (_initialized) { + return; + } + + int core_id = -1; + int chip_id = -1; + int len = 0; + char* src_string = NULL; + + _no_of_cores = os::processor_count(); + _no_of_threads = _no_of_cores; + _no_of_sockets = _no_of_cores; + snprintf(_cpu_name, CPU_TYPE_DESC_BUF_SIZE - 1, "AArch32"); + snprintf(_cpu_desc, CPU_DETAILED_DESC_BUF_SIZE, "%s", _features_string); + _initialized = true; +} + +int VM_Version_Ext::number_of_threads(void) { + initialize_cpu_information(); + return _no_of_threads; +} + +int VM_Version_Ext::number_of_cores(void) { + initialize_cpu_information(); + return _no_of_cores; +} + +int VM_Version_Ext::number_of_sockets(void) { + initialize_cpu_information(); + return _no_of_sockets; +} + +const char* VM_Version_Ext::cpu_name(void) { + initialize_cpu_information(); + char* tmp = NEW_C_HEAP_ARRAY_RETURN_NULL(char, CPU_TYPE_DESC_BUF_SIZE, mtTracing); + if (NULL == tmp) { + return NULL; + } + strncpy(tmp, _cpu_name, CPU_TYPE_DESC_BUF_SIZE); + return tmp; +} + +const char* VM_Version_Ext::cpu_description(void) { + initialize_cpu_information(); + char* tmp = NEW_C_HEAP_ARRAY_RETURN_NULL(char, CPU_DETAILED_DESC_BUF_SIZE, mtTracing); + if (NULL == tmp) { + return NULL; + } + strncpy(tmp, _cpu_desc, CPU_DETAILED_DESC_BUF_SIZE); + return tmp; +} --- /dev/null 2018-09-25 19:25:36.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vm_version_ext_aarch32.hpp 2018-09-25 19:25:36.000000000 +0300 @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2016, 2018, Oracle and/or its affiliates. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ * + */ + +#ifndef CPU_AARCH32_VM_VM_VERSION_EXT_AARCH32_HPP +#define CPU_AARCH32_VM_VM_VERSION_EXT_AARCH32_HPP + +#include "utilities/macros.hpp" +#include "vm_version_aarch32.hpp" + +class VM_Version_Ext : public VM_Version { + private: + static const size_t CPU_TYPE_DESC_BUF_SIZE = 256; + static const size_t CPU_DETAILED_DESC_BUF_SIZE = 4096; + + static int _no_of_threads; + static int _no_of_cores; + static int _no_of_sockets; + static bool _initialized; + static char _cpu_name[CPU_TYPE_DESC_BUF_SIZE]; + static char _cpu_desc[CPU_DETAILED_DESC_BUF_SIZE]; + + public: + static int number_of_threads(void); + static int number_of_cores(void); + static int number_of_sockets(void); + + static const char* cpu_name(void); + static const char* cpu_description(void); + static void initialize_cpu_information(void); + +}; + +#endif // CPU_AARCH32_VM_VM_VERSION_EXT_AARCH32_HPP --- /dev/null 2018-09-25 19:25:37.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vmreg_aarch32.cpp 2018-09-25 19:25:37.000000000 +0300 @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE.
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_VMREG_AARCH32_HPP +#define CPU_AARCH32_VM_VMREG_AARCH32_HPP + + bool is_Register() { + // BAD_REG should not pass this test. + return (unsigned int) value() < + (unsigned int) ConcreteRegisterImpl::max_gpr; + } + + bool is_FloatRegister() { + return value() >= ConcreteRegisterImpl::max_gpr && + value() < ConcreteRegisterImpl::max_fpr; + } + + Register as_Register() { + assert(is_Register(), "sanity check"); + return ::as_Register(value()); + } + + FloatRegister as_FloatRegister() { + assert(is_FloatRegister(), "sanity check"); + return ::as_FloatRegister(value() - ConcreteRegisterImpl::max_gpr); + } + + inline bool is_concrete() { + assert(is_reg(), "sanity check"); + return true; + } + +#endif // CPU_AARCH32_VM_VMREG_AARCH32_HPP --- /dev/null 2018-09-25 19:25:39.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vmreg_aarch32.inline.hpp 2018-09-25 19:25:39.000000000 +0300 @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef CPU_AARCH32_VM_VMREG_AARCH32_INLINE_HPP +#define CPU_AARCH32_VM_VMREG_AARCH32_INLINE_HPP + +inline VMReg RegisterImpl::as_VMReg() { + if (this == noreg) { + return VMRegImpl::Bad(); + } + return VMRegImpl::as_VMReg(encoding()); +} + +inline VMReg FloatRegisterImpl::as_VMReg() { + if (this == fnoreg) { + return VMRegImpl::Bad(); + } + return VMRegImpl::as_VMReg(encoding() + ConcreteRegisterImpl::max_gpr); +} + +#endif // CPU_AARCH32_VM_VMREG_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:40.000000000 +0300 +++ new/src/hotspot/cpu/aarch32/vtableStubs_aarch32.cpp 2018-09-25 19:25:40.000000000 +0300 @@ -0,0 +1,256 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. 
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "asm/macroAssembler.inline.hpp" +#include "assembler_aarch32.inline.hpp" +#include "code/vtableStubs.hpp" +#include "interp_masm_aarch32.hpp" +#include "memory/resourceArea.hpp" +#include "oops/compiledICHolder.hpp" +#include "oops/instanceKlass.hpp" +#include "oops/klassVtable.hpp" +#include "runtime/sharedRuntime.hpp" +#include "vmreg_aarch32.inline.hpp" +#ifdef COMPILER2 +#include "opto/runtime.hpp" +#endif + +// machine-dependent part of VtableStubs: create VtableStub of correct size and +// initialize its code + +#define __ masm-> + +#ifndef PRODUCT +extern "C" void bad_compiled_vtable_index(JavaThread* thread, + oop receiver, + int index); +#endif + +VtableStub* VtableStubs::create_vtable_stub(int vtable_index) { + const int aarch32_code_length = VtableStub::pd_code_size_limit(true); + VtableStub* s = new(aarch32_code_length) VtableStub(true, vtable_index); + // Can be NULL if there is no free space in the code cache. 
+ if (s == NULL) { + return NULL; + } + + ResourceMark rm; + CodeBuffer cb(s->entry_point(), aarch32_code_length); + MacroAssembler* masm = new MacroAssembler(&cb); + +#ifndef PRODUCT + if (CountCompiledCalls) { + // FIXME SharedRuntime::nof_megamorphic_calls_addr() returns un-encodable address + __ increment(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()), 1); + } +#endif + + // get receiver (need to skip return address on top of stack) + assert(VtableStub::receiver_location() == j_rarg0->as_VMReg(), "receiver expected in j_rarg0"); + + // get receiver klass + address npe_addr = __ pc(); + __ load_klass(rscratch2, j_rarg0); + +#ifndef PRODUCT + if (DebugVtables) { + Label L; + // check offset vs vtable length + __ ldr(rscratch1, Address(rscratch2, Klass::vtable_length_offset())); + __ cmp(rscratch1, vtable_index * vtableEntry::size()); + __ b(L, Assembler::GT); + __ enter(); + __ mov(r2, vtable_index); + __ call_VM(noreg, + CAST_FROM_FN_PTR(address, bad_compiled_vtable_index), j_rarg0, r2); + __ leave(); + __ bind(L); + } +#endif // PRODUCT + + __ lookup_virtual_method(rscratch2, vtable_index, rmethod); + + if (DebugVtables) { + Label L; + __ cbz(rmethod, L); + __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset())); + __ cbnz(rscratch1, L); + __ stop("Vtable entry is NULL"); + __ bind(L); + } + // r0: receiver klass + // rmethod: Method* + // r2: receiver + address ame_addr = __ pc(); + __ ldr(rscratch1, Address(rmethod, Method::from_compiled_offset())); + __ b(rscratch1); + + __ flush(); + + if (PrintMiscellaneous && (WizardMode || Verbose)) { + tty->print_cr("vtable #%d at "PTR_FORMAT"[%d] left over: %d", + vtable_index, p2i(s->entry_point()), + (int)(s->code_end() - s->entry_point()), + (int)(s->code_end() - __ pc())); + } + guarantee(__ pc() <= s->code_end(), "overflowed buffer"); + + s->set_exception_points(npe_addr, ame_addr); + return s; +} + + +VtableStub* VtableStubs::create_itable_stub(int itable_index) { + // Note well: pd_code_size_limit is the absolute minimum we can get + // away with. If you add code here, bump the code stub size + // returned by pd_code_size_limit! + const int code_length = VtableStub::pd_code_size_limit(false); + VtableStub* s = new(code_length) VtableStub(false, itable_index); + // Can be NULL if there is no free space in the code cache. 
+ if (s == NULL) { + return NULL; + } + + ResourceMark rm; + CodeBuffer cb(s->entry_point(), code_length); + MacroAssembler* masm = new MacroAssembler(&cb); + +#ifndef PRODUCT + if (CountCompiledCalls) { + // FIXME SharedRuntime::nof_megamorphic_calls_addr() returns un-encodable address + __ increment(ExternalAddress((address) SharedRuntime::nof_megamorphic_calls_addr()), 1); + } +#endif + + // Entry arguments: + // rscratch2: CompiledICHolder + // j_rarg0: Receiver + + // Most registers are in use; we'll use rmethod, rscratch1, r4 + // IMPORTANT: r4 is used as a temp register, if it's changed callee-save + // the code should be fixed + // TODO: put an assert here to ensure r4 is caller-save + const Register recv_klass_reg = rscratch1; + const Register holder_klass_reg = rscratch2; // declaring interface klass (DECC) + const Register resolved_klass_reg = rmethod; // resolved interface klass (REFC) + const Register temp_reg = r4; + const Register icholder_reg = rscratch2; + + __ ldr(resolved_klass_reg, Address(icholder_reg, CompiledICHolder::holder_klass_offset())); + // Destroys icholder value + __ ldr(holder_klass_reg, Address(icholder_reg, CompiledICHolder::holder_metadata_offset())); + + Label L_no_such_interface; + + // get receiver klass (also an implicit null-check) + address npe_addr = __ pc(); + assert(VtableStub::receiver_location() == j_rarg0->as_VMReg(), "receiver expected in j_rarg0"); + __ load_klass(recv_klass_reg, j_rarg0); + + // Receiver subtype check against REFC. + // Destroys recv_klass_reg value. + __ lookup_interface_method(// inputs: rec. class, interface + recv_klass_reg, resolved_klass_reg, noreg, + // outputs: scan temp. reg1, scan temp. reg2 + recv_klass_reg, temp_reg, + L_no_such_interface, + /*return_method=*/false); + + // Get selected method from declaring class and itable index + __ load_klass(recv_klass_reg, j_rarg0); // restore recv_klass_reg + __ lookup_interface_method(// inputs: rec. class, interface, itable index + recv_klass_reg, holder_klass_reg, itable_index, + // outputs: method, scan temp. reg + rmethod, temp_reg, + L_no_such_interface); + // rmethod: Method* + // j_rarg0: receiver + +#ifdef ASSERT + if (DebugVtables) { + Label L2; + __ cbz(rmethod, L2); + __ ldr(recv_klass_reg, Address(rmethod, Method::from_compiled_offset())); + __ cbnz(recv_klass_reg, L2); + __ stop("compiler entrypoint is null"); + __ bind(L2); + } +#endif // ASSERT + + address ame_addr = __ pc(); + __ ldr(recv_klass_reg, Address(rmethod, Method::from_compiled_offset())); + __ b(recv_klass_reg); + + __ bind(L_no_such_interface); + // Handle IncompatibleClassChangeError in itable stubs. + // More detailed error message. + // We force resolving of the call site by jumping to the "handle + // wrong method" stub, and so let the interpreter runtime do all the + // dirty work. + __ far_jump(RuntimeAddress(SharedRuntime::get_handle_wrong_method_stub())); + + __ flush(); + + if (PrintMiscellaneous && (WizardMode || Verbose)) { + tty->print_cr("itable #%d at "PTR_FORMAT"[%d] left over: %d", + itable_index, p2i(s->entry_point()), + (int)(s->code_end() - s->entry_point()), + (int)(s->code_end() - __ pc())); + } + guarantee(__ pc() <= s->code_end(), "overflowed buffer"); + + s->set_exception_points(npe_addr, ame_addr); + return s; +} + + +int VtableStub::pd_code_size_limit(bool is_vtable_stub) { + int size = DebugVtables ? 216 : 0; // FIXME + if (CountCompiledCalls) + size += 6 * 4; // FIXME. 
cannot measure, CountCalls does not work + if (is_vtable_stub) { + size += 26; + } else { + size += 160; + if (!(VM_Version::features() & (FT_ARMV7 | FT_ARMV6T2))) { + size += (NativeMovConstReg::mov_n_three_orr_sz - NativeMovConstReg::movw_movt_pair_sz); + } + } + return size; + + // In order to tune these parameters, run the JVM with VM options + // +PrintMiscellaneous and +WizardMode to see information about + // actual itable stubs. Run it with -Xmx31G -XX:+UseCompressedOops. + // + // If Universe::narrow_klass_base is nonzero, decoding a compressed + // class can take several instructions. Run it with -Xmx31G + // -XX:+UseCompressedOops. + // + // The JVM98 app. _202_jess has a megamorphic interface call. +} + +int VtableStub::pd_code_alignment() { return 4; } --- /dev/null 2018-09-25 19:25:41.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/asm_os_linux_aarch32.s 2018-09-25 19:25:41.000000000 +0300 @@ -0,0 +1,31 @@ +# Copyright 2013-2017 Azul Systems, Inc. All Rights Reserved. +# DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +# +# This code is free software; you can redistribute it and/or modify it under +# the terms of the GNU General Public License version 2 only, as published by +# the Free Software Foundation. +# +# This code is distributed in the hope that it will be useful, but WITHOUT ANY +# WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR +# A PARTICULAR PURPOSE. See the GNU General Public License version 2 for more +# details (a copy is included in the LICENSE file that accompanied this code). +# +# You should have received a copy of the GNU General Public License version 2 +# along with this work; if not, write to the Free Software Foundation, Inc., +# 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +# +# Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +# CA 94089 USA or visit www.azul.com if you need additional information or +# have any questions. + +.global linux_aarch32_current_frame_pointer +.type linux_aarch32_current_frame_pointer,%function
linux_aarch32_current_frame_pointer: + mov r0, sp + bx lr + +.global linux_aarch32_previous_frame_pointer + .type linux_aarch32_previous_frame_pointer,%function
linux_aarch32_previous_frame_pointer: + mov r0, fp + bx lr --- /dev/null 2018-09-25 19:25:42.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/assembler_linux_aarch32.cpp 2018-09-25 19:25:42.000000000 +0300 @@ -0,0 +1,27 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +// nothing required here --- /dev/null 2018-09-25 19:25:43.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/atomic_linux_aarch32.hpp 2018-09-25 19:25:43.000000000 +0300 @@ -0,0 +1,209 @@ +/* + * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_ATOMIC_LINUX_AARCH32_INLINE_HPP +#define OS_CPU_LINUX_AARCH32_VM_ATOMIC_LINUX_AARCH32_INLINE_HPP + +#include "runtime/os.hpp" +#include "vm_version_aarch32.hpp" + +// Implementation of class atomic + +// various toolchains set different symbols to indicate that ARMv7 architecture is set as a target +// starting from v7 use more lightweight barrier instructions +#if (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) +#define FULL_MEM_BARRIER __asm__ __volatile__ ("dmb ish" : : : "memory") +#define READ_MEM_BARRIER __asm__ __volatile__ ("dmb ish" : : : "memory") +#define WRITE_MEM_BARRIER __asm__ __volatile__ ("dmb ishst" : : : "memory") +#else +#define FULL_MEM_BARRIER __sync_synchronize() +#define READ_MEM_BARRIER __asm__ __volatile__ ("mcr p15,0,r0,c7,c10,5" : : : "memory") +#define WRITE_MEM_BARRIER __asm__ __volatile__ ("mcr p15,0,r0,c7,c10,5" : : : "memory") +#endif + +template<> +template +inline T Atomic::PlatformLoad<8>::operator()(T const volatile* src) const { + STATIC_ASSERT(8 == sizeof(T)); +// have seen a few toolchains which only set a subset of appropriate defines +// and as well do not provide atomic API, hence so complicated condition +#if (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6K__) || (defined(__ARM_FEATURE_LDREX) && (__ARM_FEATURE_LDREX & 8)) + register long long res; + __asm__ __volatile__ ( + "ldrexd %Q[res], %R[res], [%[addr]]" + : [res] "=r" (res) + : [addr] "r" (reinterpret_cast(src)) + : "memory"); + return PrimitiveConversions::cast(res); +#else + return PrimitiveConversions::cast(__atomic_load_n(reinterpret_cast(src), + __ATOMIC_RELAXED)); +#endif +} + +template<> +template +inline void Atomic::PlatformStore<8>::operator()(T store_value, + T volatile* dest) const { + 
STATIC_ASSERT(8 == sizeof(T)); +// have seen a few toolchains which only set a subset of appropriate defines +// and as well do not provide atomic API, hence so complicated condition +#if (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6K__) || (defined(__ARM_FEATURE_LDREX) && (__ARM_FEATURE_LDREX & 8)) + // the below is only supported since ARMv6K, adapt otherwise + register long long t1; + register int t3; + __asm__ __volatile__ ( + "repeat_%=:\n\t" + "ldrexd %Q[t1],%R[t1],[%[addr]]\n\t" + "strexd %[t3],%Q[val],%R[val],[%[addr]]\n\t" + "cmp %[t3],#0\n\t" + "bne repeat_%=" + : [t1] "=&r" (t1), + [t3] "=&r" (t3) + : [val] "r" (PrimitiveConversions::cast(store_value)), + [addr] "r" (reinterpret_cast(dest)) + : "memory"); +#else + __atomic_store_n(reinterpret_cast(dest), + PrimitiveConversions::cast(store_value), __ATOMIC_RELAXED); +#endif +} + +template +struct Atomic::PlatformAdd + : Atomic::AddAndFetch > +{ + template + D add_and_fetch(I add_value, D volatile* dest, atomic_memory_order order) const; +}; + +template<> +template +inline D Atomic::PlatformAdd<4>::add_and_fetch(I add_value, D volatile* dest, atomic_memory_order order) const { + STATIC_ASSERT(4 == sizeof(I)); + STATIC_ASSERT(4 == sizeof(D)); + return __sync_add_and_fetch(dest, add_value); +} + +template +template +inline T Atomic::PlatformXchg::operator()(T exchange_value, + T volatile* dest, + atomic_memory_order order) const { + STATIC_ASSERT(byte_size == sizeof(T)); + T res = __sync_lock_test_and_set(dest, exchange_value); + FULL_MEM_BARRIER; + return res; +} + +// No direct support for cmpxchg of bytes; emulate using int. +template<> +struct Atomic::PlatformCmpxchg<1> : Atomic::CmpxchgByteUsingInt {}; + +template<> +template +inline T Atomic::PlatformCmpxchg<4>::operator()(T exchange_value, + T volatile* dest, + T compare_value, + atomic_memory_order order) const { + STATIC_ASSERT(4 == sizeof(T)); + if (order == memory_order_relaxed) { + T value = compare_value; + __atomic_compare_exchange(dest, &value, &exchange_value, /*weak*/false, + __ATOMIC_RELAXED, __ATOMIC_RELAXED); + return value; + } else { + return __sync_val_compare_and_swap(dest, compare_value, exchange_value); + } +} + +template<> +template +inline T Atomic::PlatformCmpxchg<8>::operator()(T exchange_value, + T volatile* dest, + T compare_value, + atomic_memory_order order) const { + STATIC_ASSERT(8 == sizeof(T)); + if (order == memory_order_relaxed) { +// have seen a few toolchains which only set a subset of appropriate defines +// and as well do not provide dword CAS, hence so complicated condition +#if (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6K__) || (defined(__ARM_FEATURE_LDREX) && (__ARM_FEATURE_LDREX & 8)) + register long long old_value; + register int store_result; + __asm__ __volatile__ ( + "mov %[res],#1\n\t" + "repeat_%=:\n\t" + "ldrexd %Q[old],%R[old],[%[addr]]\n\t" + "cmp %Q[old], %Q[cmpr]\n\t" + "ittt eq\n\t" + "cmpeq %R[old], %R[cmpr]\n\t" + "strexdeq %[res],%Q[exch],%R[exch],[%[addr]]\n\t" + "cmpeq %[res],#1\n\t" + "beq repeat_%=" + : [old] "=&r" (old_value), + [res] "=&r" (store_result) + : [exch] "r" (exchange_value), + [cmpr] "r" (compare_value), + [addr] "r" (dest) + : "memory"); + return old_value; +#else + T value = compare_value; + __atomic_compare_exchange(dest, &value, &exchange_value, /*weak*/false, + __ATOMIC_RELAXED, 
__ATOMIC_RELAXED); + return value; +#endif + } else { +#if (defined(__ARM_ARCH) && __ARM_ARCH >= 7) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6K__) || (defined(__ARM_FEATURE_LDREX) && (__ARM_FEATURE_LDREX & 8)) + register long long old_value; + register int store_result; + __asm__ __volatile__ ( + "dmb ish\n\t" + "mov %[res],#1\n\t" + "repeat_%=:\n\t" + "ldrexd %Q[old],%R[old],[%[addr]]\n\t" + "cmp %Q[old], %Q[cmpr]\n\t" + "ittt eq\n\t" + "cmpeq %R[old], %R[cmpr]\n\t" + "strexdeq %[res],%Q[exch],%R[exch],[%[addr]]\n\t" + "cmpeq %[res],#1\n\t" + "beq repeat_%=\n\t" + "dmb ish" + : [old] "=&r" (old_value), + [res] "=&r" (store_result) + : [exch] "r" (exchange_value), + [cmpr] "r" (compare_value), + [addr] "r" (dest) + : "memory"); + return old_value; +#else + return __sync_val_compare_and_swap(dest, compare_value, exchange_value); +#endif + } +} + +#endif // OS_CPU_LINUX_AARCH32_VM_ATOMIC_LINUX_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:44.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/bytes_linux_aarch32.inline.hpp 2018-09-25 19:25:44.000000000 +0300 @@ -0,0 +1,46 @@ +/* + * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_BYTES_LINUX_AARCH32_INLINE_HPP +#define OS_CPU_LINUX_AARCH32_VM_BYTES_LINUX_AARCH32_INLINE_HPP + +#include + +// Efficient swapping of data bytes from Java byte +// ordering to native byte ordering and vice versa. +inline u2 Bytes::swap_u2(u2 x) { + return bswap_16(x); +} + +inline u4 Bytes::swap_u4(u4 x) { + return bswap_32(x); +} + +inline u8 Bytes::swap_u8(u8 x) { + return bswap_64(x); +} + +#endif // OS_CPU_LINUX_AARCH32_VM_BYTES_LINUX_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:45.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/copy_linux_aarch32.inline.hpp 2018-09-25 19:25:45.000000000 +0300 @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. 
+ * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_COPY_LINUX_AARCH32_INLINE_HPP +#define OS_CPU_LINUX_AARCH32_VM_COPY_LINUX_AARCH32_INLINE_HPP + +static void pd_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) { + (void)memmove(to, from, count * HeapWordSize); +} + +static void pd_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) { + switch (count) { + case 8: to[7] = from[7]; + case 7: to[6] = from[6]; + case 6: to[5] = from[5]; + case 5: to[4] = from[4]; + case 4: to[3] = from[3]; + case 3: to[2] = from[2]; + case 2: to[1] = from[1]; + case 1: to[0] = from[0]; + case 0: break; + default: + (void)memcpy(to, from, count * HeapWordSize); + break; + } +} + +static void pd_disjoint_words_atomic(const HeapWord* from, HeapWord* to, size_t count) { + switch (count) { + case 8: to[7] = from[7]; + case 7: to[6] = from[6]; + case 6: to[5] = from[5]; + case 5: to[4] = from[4]; + case 4: to[3] = from[3]; + case 3: to[2] = from[2]; + case 2: to[1] = from[1]; + case 1: to[0] = from[0]; + case 0: break; + default: + while (count-- > 0) { + *to++ = *from++; + } + break; + } +} + +static void pd_aligned_conjoint_words(const HeapWord* from, HeapWord* to, size_t count) { + pd_conjoint_words(from, to, count); +} + +static void pd_aligned_disjoint_words(const HeapWord* from, HeapWord* to, size_t count) { + pd_disjoint_words(from, to, count); +} + +static void pd_conjoint_bytes(const void* from, void* to, size_t count) { + (void)memmove(to, from, count); +} + +static void pd_conjoint_bytes_atomic(const void* from, void* to, size_t count) { + pd_conjoint_bytes(from, to, count); +} + +static void pd_conjoint_jshorts_atomic(const jshort* from, jshort* to, size_t count) { + _Copy_conjoint_jshorts_atomic(from, to, count); +} + +static void pd_conjoint_jints_atomic(const jint* from, jint* to, size_t count) { + _Copy_conjoint_jints_atomic(from, to, count); +} + +static void pd_conjoint_jlongs_atomic(const jlong* from, jlong* to, size_t count) { + _Copy_conjoint_jlongs_atomic(from, to, count); +} + +static void pd_conjoint_oops_atomic(const oop* from, oop* to, size_t count) { + assert(BytesPerInt == BytesPerOop, "jints and oops must be the same size"); + _Copy_conjoint_jints_atomic((jint*)from, (jint*)to, count); +} + +static void pd_arrayof_conjoint_bytes(const HeapWord* from, HeapWord* to, size_t count) { + _Copy_arrayof_conjoint_bytes(from, to, count); +} + +static void pd_arrayof_conjoint_jshorts(const HeapWord* from, HeapWord* to, size_t count) { + _Copy_arrayof_conjoint_jshorts(from, to, count); +} + +static void pd_arrayof_conjoint_jints(const HeapWord* from, HeapWord* to, size_t count) { + _Copy_arrayof_conjoint_jints(from, to, count); +} + +static void pd_arrayof_conjoint_jlongs(const HeapWord* from, HeapWord* to, size_t count) { + _Copy_arrayof_conjoint_jlongs(from, to, 
count); +} + +static void pd_arrayof_conjoint_oops(const HeapWord* from, HeapWord* to, size_t count) { + assert(BytesPerInt == BytesPerOop, "jints and oops must be the same size"); + _Copy_arrayof_conjoint_jints(from, to, count); +} + +#endif // OS_CPU_LINUX_AARCH32_VM_COPY_LINUX_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:46.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/globals_linux_aarch32.hpp 2018-09-25 19:25:46.000000000 +0300 @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_GLOBALS_LINUX_AARCH32_HPP +#define OS_CPU_LINUX_AARCH32_VM_GLOBALS_LINUX_AARCH32_HPP + +// Set the default values for platform dependent flags used by the runtime +// system (see globals.hpp) + +// DontYieldALot should always be set to false on Linux. +define_pd_global(bool, DontYieldALot, false); + +// Thread stack sizes are given in Kbytes. +define_pd_global(intx, ThreadStackSize, 320); +define_pd_global(intx, VMThreadStackSize, 512); +define_pd_global(intx, CompilerThreadStackSize, 512); + +define_pd_global(uintx, JVMInvokeMethodSlack, 8192); + +// HeapBaseMinAddress is used on 64 bit platforms only. +define_pd_global(uintx, HeapBaseMinAddress, 2*G); + +#endif // OS_CPU_LINUX_AARCH32_VM_GLOBALS_LINUX_AARCH32_HPP --- /dev/null 2018-09-25 19:25:48.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/linux_aarch32.S 2018-09-25 19:25:47.000000000 +0300 @@ -0,0 +1,32 @@ +// +// Copyright (c) 2003, 2012, Oracle and/or its affiliates. All rights reserved. +// Copyright (c) 2014, Red Hat Inc. All rights reserved. +// Copyright (c) 2015, Linaro Ltd. All rights reserved. +// DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. +// +// This code is free software; you can redistribute it and/or modify it +// under the terms of the GNU General Public License version 2 only, as +// published by the Free Software Foundation. +// +// This code is distributed in the hope that it will be useful, but WITHOUT +// ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or +// FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +// version 2 for more details (a copy is included in the LICENSE file that +// accompanied this code). 
+// +// You should have received a copy of the GNU General Public License version +// 2 along with this work; if not, write to the Free Software Foundation, +// Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. +// +// Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA +// or visit www.oracle.com if you need additional information or have any +// questions. + +// This file is a derivative work resulting from (and including) modifications +// made by Azul Systems, Inc. The dates of such changes are 2013-2018. +// Copyright 2013-2018 Azul Systems, Inc. All Rights Reserved. +// +// Please contact Azul Systems, 385 Moffett Park Drive, Suite 115, Sunnyvale, +// CA 94089 USA or visit www.azul.com if you need additional information or +// have any questions. + --- /dev/null 2018-09-25 19:25:49.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/orderAccess_linux_aarch32.hpp 2018-09-25 19:25:48.000000000 +0300 @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2003, 2016, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_ORDERACCESS_LINUX_AARCH32_INLINE_HPP +#define OS_CPU_LINUX_AARCH32_VM_ORDERACCESS_LINUX_AARCH32_INLINE_HPP + +#include "runtime/orderAccess.hpp" +#include "runtime/os.hpp" +#include "vm_version_aarch32.hpp" + +// Implementation of class OrderAccess. + +inline void OrderAccess::loadload() { acquire(); } +inline void OrderAccess::storestore() { + WRITE_MEM_BARRIER; +} +inline void OrderAccess::loadstore() { acquire(); } +inline void OrderAccess::storeload() { fence(); } +inline void OrderAccess::acquire() { + READ_MEM_BARRIER; +} +inline void OrderAccess::release() { + READ_MEM_BARRIER; +} +inline void OrderAccess::fence() { + FULL_MEM_BARRIER; +} + +#endif // OS_CPU_LINUX_AARCH32_VM_ORDERACCESS_LINUX_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:50.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/os_linux_aarch32.cpp 2018-09-25 19:25:49.000000000 +0300 @@ -0,0 +1,699 @@ +/* + * Copyright (c) 1999, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +// No precompiled headers +#include "jvm.h" +#include "asm/macroAssembler.hpp" +#include "classfile/classLoader.hpp" +#include "classfile/systemDictionary.hpp" +#include "classfile/vmSymbols.hpp" +#include "code/icBuffer.hpp" +#include "code/vtableStubs.hpp" +#include "interpreter/interpreter.hpp" +#include "memory/allocation.inline.hpp" +#include "nativeInst_aarch32.hpp" +#include "os_share_linux.hpp" +#include "prims/jniFastGetField.hpp" +#include "prims/jvm_misc.hpp" +#include "runtime/arguments.hpp" +#include "runtime/extendedPC.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/interfaceSupport.inline.hpp" +#include "runtime/java.hpp" +#include "runtime/javaCalls.hpp" +#include "runtime/mutexLocker.hpp" +#include "runtime/osThread.hpp" +#include "runtime/sharedRuntime.hpp" +#include "runtime/stubRoutines.hpp" +#include "runtime/thread.inline.hpp" +#include "runtime/timer.hpp" +#include "utilities/debug.hpp" +#include "utilities/events.hpp" +#include "utilities/vmError.hpp" + +// put OS-includes here +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include +# include + +#define SPELL_REG_SP "sp" +#define SPELL_REG_FP "fp" + +extern "C" { + void *linux_aarch32_current_frame_pointer(); + void *linux_aarch32_previous_frame_pointer(); +} + +address os::current_stack_pointer() { + return (address) linux_aarch32_current_frame_pointer(); +} + +char* os::non_memory_address_word() { + // Must never look like an address returned by reserve_memory, + // even in its subfields (as defined by the CPU immediate fields, + // if the CPU splits constants across multiple instructions). + + return (char*) 0xfffffffful; +} + +void os::initialize_thread(Thread *thr) { +} + +address os::Linux::ucontext_get_pc(const ucontext_t * uc) { + return (address)uc->uc_mcontext.arm_pc; +} + +void os::Linux::ucontext_set_pc(ucontext_t * uc, address pc) { + uc->uc_mcontext.arm_pc = (intptr_t)pc; +} + +intptr_t* os::Linux::ucontext_get_sp(const ucontext_t * uc) { + return (intptr_t*)uc->uc_mcontext.arm_sp; +} + +intptr_t* os::Linux::ucontext_get_fp(const ucontext_t * uc) { + return (intptr_t*)uc->uc_mcontext.arm_fp; +} + +// For Forte Analyzer AsyncGetCallTrace profiling support - thread +// is currently interrupted by SIGPROF. +// os::Solaris::fetch_frame_from_ucontext() tries to skip nested signal +// frames. Currently we don't do that on Linux, so it's the same as +// os::fetch_frame_from_context(). 
+ExtendedPC os::Linux::fetch_frame_from_ucontext(Thread* thread, + const ucontext_t* uc, intptr_t** ret_sp, intptr_t** ret_fp) { + + assert(thread != NULL, "just checking"); + assert(ret_sp != NULL, "just checking"); + assert(ret_fp != NULL, "just checking"); + + return os::fetch_frame_from_context(uc, ret_sp, ret_fp); +} + +ExtendedPC os::fetch_frame_from_context(const void* ucVoid, + intptr_t** ret_sp, intptr_t** ret_fp) { + + ExtendedPC epc; + ucontext_t* uc = (ucontext_t*)ucVoid; + + if (uc != NULL) { + epc = ExtendedPC(os::Linux::ucontext_get_pc(uc)); + if (ret_sp) *ret_sp = os::Linux::ucontext_get_sp(uc); + if (ret_fp) *ret_fp = os::Linux::ucontext_get_fp(uc); + } else { + // construct empty ExtendedPC for return value checking + epc = ExtendedPC(NULL); + if (ret_sp) *ret_sp = (intptr_t *)NULL; + if (ret_fp) *ret_fp = (intptr_t *)NULL; + } + + return epc; +} + +frame os::fetch_frame_from_context(const void* ucVoid) { + intptr_t* sp; + intptr_t* fp; + ExtendedPC epc = fetch_frame_from_context(ucVoid, &sp, &fp); + return frame(sp, fp, epc.pc()); +} + +bool os::Linux::get_frame_at_stack_banging_point(JavaThread* thread, ucontext_t* uc, frame* fr) { + address pc = (address) os::Linux::ucontext_get_pc(uc); + if (Interpreter::contains(pc)) { + // interpreter performs stack banging after the fixed frame header has + // been generated while the compilers perform it before. To maintain + // semantic consistency between interpreted and compiled frames, the + // method returns the Java sender of the current frame. + *fr = os::fetch_frame_from_context(uc); + if (!fr->is_first_java_frame()) { + assert(fr->safe_for_sender(thread), "Safety check"); + *fr = fr->java_sender(); + } + } else { + // more complex code with compiled code + assert(!Interpreter::contains(pc), "Interpreted methods should have been handled above"); + CodeBlob* cb = CodeCache::find_blob(pc); + if (cb == NULL || !cb->is_nmethod() || cb->is_frame_complete_at(pc)) { + // Not sure where the pc points to, fallback to default + // stack overflow handling + return false; + } else { + // In compiled code, the stack banging is performed before LR + // has been saved in the frame. LR is live, and SP and FP + // belong to the caller. + intptr_t* fp = os::Linux::ucontext_get_fp(uc); + intptr_t* sp = os::Linux::ucontext_get_sp(uc); + //TODO: XXX: Merge + // could be pc = os::Linux::ucontext_get_pc(uc) ? + address pc = (address)(uc->uc_mcontext.arm_lr + - NativeInstruction::arm_insn_sz); + *fr = frame(sp, fp, pc); + if (!fr->is_java_frame()) { + assert(fr->safe_for_sender(thread), "Safety check"); + assert(!fr->is_first_frame(), "Safety check"); + *fr = fr->java_sender(); + } + } + } + assert(fr->is_java_frame(), "Safety check"); + return true; +} + +// By default, gcc always saves frame pointer rfp on this stack. This +// may get turned off by -fomit-frame-pointer. 
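+// Note on register usage: in ARM state GCC keeps the frame pointer in r11 (fp),
+// while in Thumb state it uses r7, so the APCS-style fp chain assumed by the
+// frame walking below is not maintained in Thumb builds; that is why the
+// __thumb__ variants of the functions below simply return an empty frame.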
+frame os::get_sender_for_C_frame(frame* fr) { +#ifdef __thumb__ + return frame(); +#else + address sender_pc = *(address*) fr->addr_at(fr->get_return_addr_offset(JNIFrameAPCS)); + intptr_t* link = *(intptr_t**) fr->addr_at(fr->get_link_offset(JNIFrameAPCS)); + return frame(fr->sender_sp(), link, sender_pc); +#endif +} + +frame os::current_frame() { +#ifdef __thumb__ + return frame(); +#else + intptr_t* fp = (intptr_t*)linux_aarch32_previous_frame_pointer(); + frame myframe((intptr_t*)os::current_stack_pointer(), + (intptr_t*)fp, + CAST_FROM_FN_PTR(address, os::current_frame)); + // check for C2 frame first, those to not have valid FP + if (!CodeCache::find_blob( + *(address*)myframe.addr_at(myframe.get_return_addr_offset(JNIFrameAPCS))) && + os::is_first_C_frame(&myframe)) { + // stack is not walkable + return frame(); + } else { + return os::get_sender_for_C_frame(&myframe); + } +#endif +} + +// Utility functions + +// From IA32 System Programming Guide +enum { + trap_page_fault = 0xE +}; + +// An operation in Unsafe has faulted. We're going to return to the +// instruction after the faulting load or store. We also set +// pending_unsafe_access_error so that at some point in the future our +// user will get a helpful message. +static address handle_unsafe_access(JavaThread* thread, address pc) { + // pc is the instruction which we must emulate + // doing a no-op is fine: return garbage from the load + // therefore, compute npc + address npc = pc + NativeInstruction::arm_insn_sz; + + // request an async exception + thread->set_pending_unsafe_access_error(); + + // return address of next instruction to execute + return npc; +} + +extern "C" JNIEXPORT int +JVM_handle_linux_signal(int sig, + siginfo_t* info, + void* ucVoid, + int abort_if_unrecognized) { + ucontext_t* uc = (ucontext_t*) ucVoid; + + Thread* t = Thread::current_or_null_safe(); + + // Must do this before SignalHandlerMark, if crash protection installed we will longjmp away + // (no destructors can be run) + os::ThreadCrashProtection::check_crash_protection(sig, t); + + SignalHandlerMark shm(t); + + // Note: it's not uncommon that JNI code uses signal/sigset to install + // then restore certain signal handler (e.g. to temporarily block SIGPIPE, + // or have a SIGILL handler when detecting CPU type). When that happens, + // JVM_handle_linux_signal() might be invoked with junk info/ucVoid. To + // avoid unnecessary crash when libjsig is not preloaded, try handle signals + // that do not require siginfo/ucontext first. + + if (sig == SIGPIPE || sig == SIGXFSZ) { + // allow chained handler to go first + if (os::Linux::chained_handler(sig, info, ucVoid)) { + return true; + } else { + if (PrintMiscellaneous && (WizardMode || Verbose)) { + char buf[64]; + warning("Ignoring %s - see bugs 4229104 or 646499219", + os::exception_name(sig, buf, sizeof(buf))); + } + return true; + } + } + +#ifdef CAN_SHOW_REGISTERS_ON_ASSERT + if ((sig == SIGSEGV || sig == SIGBUS) && info != NULL && info->si_addr == g_assert_poison) { + handle_assert_poison_fault(ucVoid, info->si_addr); + return 1; + } +#endif + + JavaThread* thread = NULL; + VMThread* vmthread = NULL; + if (os::Linux::signal_handlers_are_installed) { + if (t != NULL ){ + if(t->is_Java_thread()) { + thread = (JavaThread*)t; + } + else if(t->is_VM_thread()){ + vmthread = (VMThread *)t; + } + } + } +/* + NOTE: does not seem to work on linux. 
+ if (info == NULL || info->si_code <= 0 || info->si_code == SI_NOINFO) { + // can't decode this kind of signal + info = NULL; + } else { + assert(sig == info->si_signo, "bad siginfo"); + } +*/ + // decide if this trap can be handled by a stub + address stub = NULL; + + address pc = NULL; + + //%note os_trap_1 + if (info != NULL && uc != NULL && thread != NULL) { + pc = (address) os::Linux::ucontext_get_pc(uc); + + if (StubRoutines::is_safefetch_fault(pc)) { + os::Linux::ucontext_set_pc(uc, StubRoutines::continuation_for_safefetch_fault(pc)); + return 1; + } + + // Handle ALL stack overflow variations here + if (sig == SIGSEGV) { + address addr = (address) info->si_addr; + + // check if fault address is within thread stack + if (thread->on_local_stack(addr)) { + // stack overflow + if (thread->in_stack_yellow_reserved_zone(addr)) { + thread->disable_stack_yellow_reserved_zone(); + if (thread->thread_state() == _thread_in_Java) { + if (thread->in_stack_reserved_zone(addr)) { + frame fr; + if (os::Linux::get_frame_at_stack_banging_point(thread, uc, &fr)) { + assert(fr.is_java_frame(), "Must be a Java frame"); + frame activation = + SharedRuntime::look_for_reserved_stack_annotated_method(thread, fr); + if (activation.sp() != NULL) { + thread->disable_stack_reserved_zone(); + if (activation.is_interpreted_frame()) { + thread->set_reserved_stack_activation((address)( + activation.fp() + frame::interpreter_frame_initial_sp_offset)); + } else { + thread->set_reserved_stack_activation((address)activation.unextended_sp()); + } + return 1; + } + } + } + // Throw a stack overflow exception. Guard pages will be reenabled + // while unwinding the stack. + stub = SharedRuntime::continuation_for_implicit_exception(thread, pc, SharedRuntime::STACK_OVERFLOW); + } else { + // Thread was in the vm or native code. Return and try to finish. + return 1; + } + } else if (thread->in_stack_red_zone(addr)) { + // Fatal red zone violation. Disable the guard pages and fall through + // to handle_unexpected_exception way down below. + thread->disable_stack_red_zone(); + tty->print_raw_cr("An irrecoverable stack overflow has occurred."); + + // This is a likely cause, but hard to verify. Let's just print + // it as a hint. + tty->print_raw_cr("Please check if any of your loaded .so files has " + "enabled executable stack (see man page execstack(8))"); + } else { + // Accessing stack address below sp may cause SEGV if current + // thread has MAP_GROWSDOWN stack. This should only happen when + // current thread was created by user code with MAP_GROWSDOWN flag + // and then attached to VM. See notes in os_linux.cpp. + if (thread->osthread()->expanding_stack() == 0) { + thread->osthread()->set_expanding_stack(); + if (os::Linux::manually_expand_stack(thread, addr)) { + thread->osthread()->clear_expanding_stack(); + return 1; + } + thread->osthread()->clear_expanding_stack(); + } else { + fatal("recursive segv. expanding stack."); + } + } + } + } + + if (thread->thread_state() == _thread_in_Java) { + // Java thread running in Java code => find exception handler if any + // a fault inside compiled code, the interpreter, or a stub + + // Handle signal from NativeJump::patch_verified_entry(). + if ((sig == SIGILL || sig == SIGTRAP) + && nativeInstruction_at(pc)->is_sigill_zombie_not_entrant()) { + if (TraceTraps) { + tty->print_cr("trap: zombie_not_entrant (%s)", (sig == SIGTRAP) ? 
"SIGTRAP" : "SIGILL"); + } + stub = SharedRuntime::get_handle_wrong_method_stub(); + } else if (sig == SIGSEGV && os::is_poll_address((address)info->si_addr)) { + stub = SharedRuntime::get_poll_stub(pc); + } else if (sig == SIGBUS /* && info->si_code == BUS_OBJERR */) { + // BugId 4454115: A read from a MappedByteBuffer can fault + // here if the underlying file has been truncated. + // Do not crash the VM in such a case. + CodeBlob* cb = CodeCache::find_blob_unsafe(pc); + CompiledMethod* nm = (cb != NULL) ? cb->as_compiled_method_or_null() : NULL; + if (nm != NULL && nm->has_unsafe_access()) { + stub = handle_unsafe_access(thread, pc); + } + } + else + + if (sig == SIGFPE && + (info->si_code == FPE_INTDIV || info->si_code == FPE_FLTDIV)) { + stub = + SharedRuntime:: + continuation_for_implicit_exception(thread, + pc, + SharedRuntime:: + IMPLICIT_DIVIDE_BY_ZERO); + } else if (sig == SIGSEGV && + !MacroAssembler::needs_explicit_null_check((intptr_t)info->si_addr)) { + // Determination of interpreter/vtable stub/compiled code null exception + stub = SharedRuntime::continuation_for_implicit_exception(thread, pc, SharedRuntime::IMPLICIT_NULL); + } + } else if (sig == SIGILL && VM_Version::is_determine_features_test_running()) { + // SIGILL must be caused by VM_Version::get_processor_features(). + *(int *)pc = Assembler::nop_insn; // patch instruction to NOP to indicate that it causes a SIGILL, + // flushing of icache is not necessary. + stub = pc + 4; // continue with next instruction. + } else if (thread->thread_state() == _thread_in_vm && + sig == SIGBUS && /* info->si_code == BUS_OBJERR && */ + thread->doing_unsafe_access()) { + stub = handle_unsafe_access(thread, pc); + } + + // jni_fast_GetField can trap at certain pc's if a GC kicks in + // and the heap gets shrunk before the field access. + if ((sig == SIGSEGV) || (sig == SIGBUS)) { + address addr = JNI_FastGetField::find_slowcase_pc(pc); + if (addr != (address)-1) { + stub = addr; + } + } + + // Check to see if we caught the safepoint code in the + // process of write protecting the memory serialization page. + // It write enables the page immediately after protecting it + // so we can just return to retry the write. + if ((sig == SIGSEGV) && + os::is_memory_serialize_page(thread, (address) info->si_addr)) { + // Block current thread until the memory serialize page permission restored. + os::block_on_serialize_page_trap(); + return true; + } + } + + if (stub != NULL) { + // save all thread context in case we need to restore it + if (thread != NULL) thread->set_saved_exception_pc(pc); + + os::Linux::ucontext_set_pc(uc, stub); + return true; + } + + // signal-chaining + if (os::Linux::chained_handler(sig, info, ucVoid)) { + return true; + } + + if (!abort_if_unrecognized) { + // caller wants another chance, so give it to him + return false; + } + + if (pc == NULL && uc != NULL) { + pc = os::Linux::ucontext_get_pc(uc); + } + + // unmask current signal + sigset_t newset; + sigemptyset(&newset); + sigaddset(&newset, sig); + sigprocmask(SIG_UNBLOCK, &newset, NULL); + + VMError::report_and_die(t, sig, pc, info, ucVoid); + + ShouldNotReachHere(); + return true; // Mute compiler +} + +void os::Linux::init_thread_fpu_state(void) { +} + +int os::Linux::get_fpu_control_word(void) { + return 0; +} + +void os::Linux::set_fpu_control_word(int fpu_control) { +} + +// Check that the linux kernel version is 2.4 or higher since earlier +// versions do not support SSE without patches. 
+bool os::supports_sse() { + return true; +} + +bool os::is_allocatable(size_t bytes) { + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +// thread stack + +// Minimum usable stack sizes required to get to user code. Space for +// HotSpot guard pages is added later. +size_t os::Posix::_compiler_thread_min_stack_allowed = (32 DEBUG_ONLY(+ 4)) * K; +size_t os::Posix::_java_thread_min_stack_allowed = (32 DEBUG_ONLY(+ 4)) * K; +size_t os::Posix::_vm_internal_thread_min_stack_allowed = (48 DEBUG_ONLY(+ 4)) * K; + +// return default stack size for thr_type +size_t os::Posix::default_stack_size(os::ThreadType thr_type) { + // default stack size (compiler thread needs larger stack) + size_t s = (thr_type == os::compiler_thread ? 2 * M : 512 * K); + return s; +} + +// Java thread: +// +// Low memory addresses +// +------------------------+ +// | |\ JavaThread created by VM does not have glibc +// | glibc guard page | - guard, attached Java thread usually has +// | |/ 1 page glibc guard. +// P1 +------------------------+ Thread::stack_base() - Thread::stack_size() +// | |\ +// | HotSpot Guard Pages | - red and yellow pages +// | |/ +// +------------------------+ JavaThread::stack_yellow_zone_base() +// | |\ +// | Normal Stack | - +// | |/ +// P2 +------------------------+ Thread::stack_base() +// +// Non-Java thread: +// +// Low memory addresses +// +------------------------+ +// | |\ +// | glibc guard page | - usually 1 page +// | |/ +// P1 +------------------------+ Thread::stack_base() - Thread::stack_size() +// | |\ +// | Normal Stack | - +// | |/ +// P2 +------------------------+ Thread::stack_base() +// +// ** P1 (aka bottom) and size ( P2 = P1 - size) are the address and stack size returned from +// pthread_attr_getstack() + +///////////////////////////////////////////////////////////////////////////// +// helper functions for fatal error handler + + +void os::print_context(outputStream *st, const void *context) { + if (context == NULL) return; + + ucontext_t *uc = (ucontext_t*)context; + st->print_cr("Registers:"); + + for (int r = 0; r < 16; r++) + st->print_cr( "R%d=" INTPTR_FORMAT, r, *((unsigned int*)&uc->uc_mcontext.arm_r0 + r) ); + + st->cr(); + + intptr_t *sp = (intptr_t *)os::Linux::ucontext_get_sp(uc); + st->print_cr("Top of Stack: (sp=" PTR_FORMAT ")", p2i(sp)); + print_hex_dump(st, (address)sp, (address)(sp + 8*sizeof(intptr_t)), sizeof(intptr_t)); + st->cr(); + + // Note: it may be unsafe to inspect memory near pc. For example, pc may + // point to garbage if entry point in an nmethod is corrupted. Leave + // this at the end, and hope for the best. + address pc = os::Linux::ucontext_get_pc(uc); + st->print_cr("Instructions: (pc=" PTR_FORMAT ")", p2i(pc)); + print_hex_dump(st, pc - 32, pc + 32, sizeof(char)); +} + +void os::print_register_info(outputStream *st, const void *context) { + if (context == NULL) return; + + ucontext_t *uc = (ucontext_t*)context; + + st->print_cr("Register to memory mapping:"); + st->cr(); + + for (int r = 0; r < 16; r++) { + st->print( "R%d=", r); print_location(st, *((unsigned int*)&uc->uc_mcontext.arm_r0 + r)); + } + st->cr(); +} + +void os::setup_fpu() { +} + +#ifndef PRODUCT +void os::verify_stack_alignment() { + assert(((intptr_t)os::current_stack_pointer() & (StackAlignmentInBytes-1)) == 0, "incorrect stack alignment"); +} +#endif + +int os::extra_bang_size_in_bytes() { + // AArch64 does not require the additional stack bang. + // does AArch32? 
+ return 0; +} + +extern "C" { + int SpinPause() { + return 0; + } + + void _Copy_conjoint_jshorts_atomic(jshort* from, jshort* to, size_t count) { + if (from > to) { + jshort *end = from + count; + while (from < end) + *(to++) = *(from++); + } + else if (from < to) { + jshort *end = from; + from += count - 1; + to += count - 1; + while (from >= end) + *(to--) = *(from--); + } + } + void _Copy_conjoint_jints_atomic(jint* from, jint* to, size_t count) { + if (from > to) { + jint *end = from + count; + while (from < end) + *(to++) = *(from++); + } + else if (from < to) { + jint *end = from; + from += count - 1; + to += count - 1; + while (from >= end) + *(to--) = *(from--); + } + } + void _Copy_conjoint_jlongs_atomic(jlong* from, jlong* to, size_t count) { + if (from > to) { + jlong *end = from + count; + while (from < end) + os::atomic_copy64(from++, to++); + } + else if (from < to) { + jlong *end = from; + from += count - 1; + to += count - 1; + while (from >= end) + os::atomic_copy64(from--, to--); + } + } + + void _Copy_arrayof_conjoint_bytes(HeapWord* from, + HeapWord* to, + size_t count) { + memmove(to, from, count); + } + void _Copy_arrayof_conjoint_jshorts(HeapWord* from, + HeapWord* to, + size_t count) { + memmove(to, from, count * 2); + } + void _Copy_arrayof_conjoint_jints(HeapWord* from, + HeapWord* to, + size_t count) { + memmove(to, from, count * 4); + } + void _Copy_arrayof_conjoint_jlongs(HeapWord* from, + HeapWord* to, + size_t count) { + memmove(to, from, count * 8); + } +}; --- /dev/null 2018-09-25 19:25:51.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/os_linux_aarch32.hpp 2018-09-25 19:25:50.000000000 +0300 @@ -0,0 +1,46 @@ +/* + * Copyright (c) 1999, 2011, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. 
+ *
+ */
+
+#ifndef OS_CPU_LINUX_AARCH32_VM_OS_LINUX_AARCH32_HPP
+#define OS_CPU_LINUX_AARCH32_VM_OS_LINUX_AARCH32_HPP
+
+  static void setup_fpu();
+  static bool supports_sse();
+
+  static jlong rdtsc();
+
+  static bool is_allocatable(size_t bytes);
+
+  // Used to register dynamic code cache area with the OS
+  // Note: Currently only used in 64 bit Windows implementations
+  static bool register_code_area(char *low, char *high) { return true; }
+
+  // Atomically copy 64 bits of data
+  static void atomic_copy64(const volatile void *src, volatile void *dst) {
+    *(jlong *) dst = *(const jlong *) src;
+  }
+
+#endif // OS_CPU_LINUX_AARCH32_VM_OS_LINUX_AARCH32_HPP
--- /dev/null	2018-09-25 19:25:52.000000000 +0300
+++ new/src/hotspot/os_cpu/linux_aarch32/os_linux_aarch32.inline.hpp	2018-09-25 19:25:51.000000000 +0300
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2011, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2015, Linaro Ltd. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License version
+ * 2 along with this work; if not, write to the Free Software Foundation,
+ * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
+ *
+ * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
+ * or visit www.oracle.com if you need additional information or have any
+ * questions.
+ *
+ */
+
+#ifndef OS_CPU_LINUX_AARCH32_VM_OS_LINUX_AARCH32_INLINE_HPP
+#define OS_CPU_LINUX_AARCH32_VM_OS_LINUX_AARCH32_INLINE_HPP
+
+#include "runtime/os.hpp"
+
+// AArch32 has no x86-style time-stamp counter (RDTSC), so return the VM's
+// monotonic clock as a portable stand-in; that is adequate for the
+// elapsed-time measurements this counter is used for.
+inline jlong os::rdtsc() {
+  // javaTimeNanos() is monotonic on Linux (clock_gettime) and cheap enough
+  // for coarse timing.
+  return javaTimeNanos();
+}
+
+#endif // OS_CPU_LINUX_AARCH32_VM_OS_LINUX_AARCH32_INLINE_HPP
--- /dev/null	2018-09-25 19:25:53.000000000 +0300
+++ new/src/hotspot/os_cpu/linux_aarch32/prefetch_linux_aarch32.inline.hpp	2018-09-25 19:25:52.000000000 +0300
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, Red Hat Inc. All rights reserved.
+ * Copyright (c) 2015, Linaro Ltd. All rights reserved.
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This code is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 only, as
+ * published by the Free Software Foundation.
+ *
+ * This code is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * version 2 for more details (a copy is included in the LICENSE file that
+ * accompanied this code).
+ * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_PREFETCH_LINUX_AARCH32_INLINE_HPP +#define OS_CPU_LINUX_AARCH32_VM_PREFETCH_LINUX_AARCH32_INLINE_HPP + +#include "runtime/prefetch.hpp" + + +inline void Prefetch::read (void *loc, intx interval) { +//FIXME Put this back +// if (interval >= 0) +// asm("prfm PLDL1KEEP, [%0, %1]" : : "r"(loc), "r"(interval)); +//FIXME END +} + +inline void Prefetch::write(void *loc, intx interval) { +//FIXME Put this back +// if (interval >= 0) +// asm("prfm PSTL1KEEP, [%0, %1]" : : "r"(loc), "r"(interval)); +//FIXME END +} + +#endif // OS_CPU_LINUX_AARCH32_VM_PREFETCH_LINUX_AARCH32_INLINE_HPP --- /dev/null 2018-09-25 19:25:54.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/thread_linux_aarch32.cpp 2018-09-25 19:25:53.000000000 +0300 @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "memory/metaspaceShared.hpp" +#include "runtime/frame.inline.hpp" +#include "runtime/thread.inline.hpp" + +frame JavaThread::pd_last_frame() { + assert(has_last_Java_frame(), "must have last_Java_sp() when suspended"); + return frame(_anchor.last_Java_sp(), _anchor.last_Java_fp(), _anchor.last_Java_pc()); +} + +// For Forte Analyzer AsyncGetCallTrace profiling support - thread is +// currently interrupted by SIGPROF +bool JavaThread::pd_get_top_frame_for_signal_handler(frame* fr_addr, + void* ucontext, bool isInJava) { + + assert(Thread::current() == this, "caller must be current thread"); + return pd_get_top_frame(fr_addr, ucontext, isInJava); +} + +bool JavaThread::pd_get_top_frame_for_profiling(frame* fr_addr, void* ucontext, bool isInJava) { + return pd_get_top_frame(fr_addr, ucontext, isInJava); +} + +bool JavaThread::pd_get_top_frame(frame* fr_addr, void* ucontext, bool isInJava) { + assert(this->is_Java_thread(), "must be JavaThread"); + JavaThread* jt = (JavaThread *)this; + + // If we have a last_Java_frame, then we should use it even if + // isInJava == true. 
It should be more reliable than ucontext info. + if (jt->has_last_Java_frame() && jt->frame_anchor()->walkable()) { + *fr_addr = jt->pd_last_frame(); + return true; + } + + // At this point, we don't have a last_Java_frame, so + // we try to glean some information out of the ucontext + // if we were running Java code when SIGPROF came in. + if (isInJava) { + ucontext_t* uc = (ucontext_t*) ucontext; + + intptr_t* ret_fp; + intptr_t* ret_sp; + ExtendedPC addr = os::Linux::fetch_frame_from_ucontext(this, uc, + &ret_sp, &ret_fp); + if (addr.pc() == NULL || ret_sp == NULL ) { + // ucontext wasn't useful + return false; + } + + if (MetaspaceShared::is_in_trampoline_frame(addr.pc())) { + // In the middle of a trampoline call. Bail out for safety. + // This happens rarely so shouldn't affect profiling. + return false; + } + + frame ret_frame(ret_sp, ret_fp, addr.pc()); + if (!ret_frame.safe_for_sender(jt)) { +#ifdef COMPILER2 + frame ret_frame2(ret_sp, NULL, addr.pc()); + if (!ret_frame2.safe_for_sender(jt)) { + // nothing else to try if the frame isn't good + return false; + } + ret_frame = ret_frame2; +#else + // nothing else to try if the frame isn't good + return false; +#endif /* COMPILER2 */ + } + *fr_addr = ret_frame; + return true; + } + + // nothing else to try + return false; +} + +void JavaThread::cache_global_variables() { } + --- /dev/null 2018-09-25 19:25:55.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/thread_linux_aarch32.hpp 2018-09-25 19:25:54.000000000 +0300 @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2000, 2018, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_THREAD_LINUX_AARCH32_HPP +#define OS_CPU_LINUX_AARCH32_VM_THREAD_LINUX_AARCH32_HPP + + private: +#ifdef ASSERT + // spill stack holds N callee-save registers at each Java call and + // grows downwards towards limit + // we need limit to check we have space for a spill and base so we + // can identify all live spill frames at GC (eventually) + address _spill_stack; + address _spill_stack_base; + address _spill_stack_limit; +#endif // ASSERT + + void pd_initialize() { + _anchor.clear(); + } + + frame pd_last_frame(); + + public: + // Mutators are highly dangerous.... 
+ intptr_t* last_Java_fp() { return _anchor.last_Java_fp(); } + void set_last_Java_fp(intptr_t* fp) { _anchor.set_last_Java_fp(fp); } + + void set_base_of_stack_pointer(intptr_t* base_sp) { + } + + static ByteSize last_Java_fp_offset() { + return byte_offset_of(JavaThread, _anchor) + JavaFrameAnchor::last_Java_fp_offset(); + } + + intptr_t* base_of_stack_pointer() { + return NULL; + } + void record_base_of_stack_pointer() { + } + + bool pd_get_top_frame_for_signal_handler(frame* fr_addr, void* ucontext, + bool isInJava); + + bool pd_get_top_frame_for_profiling(frame* fr_addr, void* ucontext, bool isInJava); +private: + bool pd_get_top_frame(frame* fr_addr, void* ucontext, bool isInJava); +public: + + // These routines are only used on cpu architectures that + // have separate register stacks (Itanium). + static bool register_stack_overflow() { return false; } + static void enable_register_stack_guard() {} + static void disable_register_stack_guard() {} + +#endif // OS_CPU_LINUX_AARCH32_VM_THREAD_LINUX_AARCH32_HPP --- /dev/null 2018-09-25 19:25:56.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/vmStructs_linux_aarch32.hpp 2018-09-25 19:25:55.000000000 +0300 @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2000, 2012, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#ifndef OS_CPU_LINUX_AARCH32_VM_VMSTRUCTS_LINUX_AARCH32_HPP +#define OS_CPU_LINUX_AARCH32_VM_VMSTRUCTS_LINUX_AARCH32_HPP + +// These are the OS and CPU-specific fields, types and integer +// constants required by the Serviceability Agent. This file is +// referenced by vmStructs.cpp. 
+ +#define VM_STRUCTS_OS_CPU(nonstatic_field, static_field, unchecked_nonstatic_field, volatile_nonstatic_field, nonproduct_nonstatic_field, c2_nonstatic_field, unchecked_c1_static_field, unchecked_c2_static_field) \ + \ + /******************************/ \ + /* Threads (NOTE: incomplete) */ \ + /******************************/ \ + nonstatic_field(OSThread, _thread_id, OSThread::thread_id_t) \ + nonstatic_field(OSThread, _pthread_id, pthread_t) + + +#define VM_TYPES_OS_CPU(declare_type, declare_toplevel_type, declare_oop_type, declare_integer_type, declare_unsigned_integer_type, declare_c1_toplevel_type, declare_c2_type, declare_c2_toplevel_type) \ + \ + /**********************/ \ + /* Posix Thread IDs */ \ + /**********************/ \ + \ + declare_integer_type(OSThread::thread_id_t) \ + declare_unsigned_integer_type(pthread_t) + +#define VM_INT_CONSTANTS_OS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) + +#define VM_LONG_CONSTANTS_OS_CPU(declare_constant, declare_preprocessor_constant, declare_c1_constant, declare_c2_constant, declare_c2_preprocessor_constant) + +#endif // OS_CPU_LINUX_AARCH32_VM_VMSTRUCTS_LINUX_AARCH32_HPP --- /dev/null 2018-09-25 19:25:57.000000000 +0300 +++ new/src/hotspot/os_cpu/linux_aarch32/vm_version_linux_aarch32.cpp 2018-09-25 19:25:56.000000000 +0300 @@ -0,0 +1,30 @@ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, Red Hat Inc. All rights reserved. + * Copyright (c) 2015, Linaro Ltd. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +#include "precompiled.hpp" +#include "runtime/os.hpp" +#include "vm_version_aarch32.hpp" + --- /dev/null 2018-09-25 19:25:58.000000000 +0300 +++ new/src/jdk.hotspot.agent/share/classes/sun/jvm/hotspot/debugger/MachineDescriptionArm.java 2018-09-25 19:25:57.000000000 +0300 @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2003, 2014, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015-2018, Azul Systems, Inc. All rights reserved. + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This code is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 only, as + * published by the Free Software Foundation. + * + * This code is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License + * version 2 for more details (a copy is included in the LICENSE file that + * accompanied this code). + * + * You should have received a copy of the GNU General Public License version + * 2 along with this work; if not, write to the Free Software Foundation, + * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA + * or visit www.oracle.com if you need additional information or have any + * questions. + * + */ + +package sun.jvm.hotspot.debugger; + +public class MachineDescriptionArm extends MachineDescriptionTwosComplement implements MachineDescription { + public long getAddressSize() { + return 4; + } + + public boolean isLP64() { + return false; + } + + public boolean isBigEndian() { + return false; + } +}
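
As a closing illustration of the Java-thread stack diagram in os_linux_aarch32.cpp above: P1 and P2 come straight from the pthread attributes of the current thread. The sketch below is illustrative only and not part of the patch; it shows how the bottom address and size reported by glibc relate to what HotSpot calls Thread::stack_base().

    // Illustrative sketch: mapping glibc's view of the current thread's stack
    // onto P1/P2 from the layout comments in os_linux_aarch32.cpp.
    // pthread_getattr_np() is a GNU extension; _GNU_SOURCE is needed when
    // compiling as plain C (g++ defines it by default).
    #include <pthread.h>

    static void query_stack_extent() {
      pthread_attr_t attr;
      if (pthread_getattr_np(pthread_self(), &attr) == 0) {
        void*  bottom = NULL;   // P1: lowest usable stack address
        size_t size   = 0;
        pthread_attr_getstack(&attr, &bottom, &size);
        void* top = (char*)bottom + size;  // P2: what HotSpot calls Thread::stack_base()
        pthread_attr_destroy(&attr);
        (void)top;
      }
    }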